jerrrrry / verl_mixtral_8x7B · Commits

Commit f87b35b2
Authored Apr 17, 2025 by jerrrrry

Initial commit

Pipeline #2648 failed with stages in 0 seconds
Changes: 363 · Pipelines: 1

Showing 20 changed files with 4575 additions and 0 deletions (+4575 -0)
verl/single_controller/ray/base.py                      +500 -0
verl/single_controller/ray/megatron.py                    +62 -0
verl/third_party/__init__.py                              +13 -0
verl/third_party/sglang/__init__.py                       +27 -0
verl/third_party/sglang/parallel_state.py                +328 -0
verl/third_party/vllm/__init__.py                         +66 -0
verl/third_party/vllm/vllm_v_0_3_1/__init__.py            +13 -0
verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py          +228 -0
verl/third_party/vllm/vllm_v_0_3_1/config.py             +577 -0
verl/third_party/vllm/vllm_v_0_3_1/llm.py                +275 -0
verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py      +765 -0
verl/third_party/vllm/vllm_v_0_3_1/model_loader.py       +275 -0
verl/third_party/vllm/vllm_v_0_3_1/model_runner.py       +285 -0
verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py     +147 -0
verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py           +72 -0
verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py      +95 -0
verl/third_party/vllm/vllm_v_0_3_1/worker.py             +314 -0
verl/third_party/vllm/vllm_v_0_4_2/__init__.py            +13 -0
verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py          +320 -0
verl/third_party/vllm/vllm_v_0_4_2/config.py             +200 -0
verl/single_controller/ray/base.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from typing import Dict, List, Any, Tuple, Optional

import ray
from ray.util import list_named_actors
from ray.util.placement_group import placement_group, PlacementGroup
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy, NodeAffinitySchedulingStrategy
from ray.experimental.state.api import get_actor

from verl.single_controller.base import WorkerGroup, ResourcePool, ClassWithInitArgs, Worker

__all__ = ['Worker']


def get_random_string(length: int) -> str:
    import random
    import string
    letters_digits = string.ascii_letters + string.digits
    return ''.join(random.choice(letters_digits) for _ in range(length))


def func_generator(self, method_name, dispatch_fn, collect_fn, execute_fn, blocking):

    def func(*args, **kwargs):
        args, kwargs = dispatch_fn(self, *args, **kwargs)
        output = execute_fn(method_name, *args, **kwargs)
        if blocking:
            output = ray.get(output)
        output = collect_fn(self, output)
        return output

    return func


def sort_placement_group_by_node_ip(pgs: List[PlacementGroup]) -> List[PlacementGroup]:
    """
    Sort the placement groups by node ip; all bundles in a single placement group should be on the same node.

    FSDPCheckpointManager saves sharded model states and optimizer states in local storage, which requires RANK
    to be consistent across nodes when resuming from a checkpoint.

    With this function, if there's only one resource pool and there's no node change, RANK should be consistent
    across nodes in multiple ray jobs, even if the whole ray cluster is restarted.
    """
    node_ip = {node["NodeID"]: node["NodeManagerAddress"] for node in ray.nodes()}
    pg_ip = {}
    for pg in pgs:
        specs = ray._private.state.state.placement_group_table(pg.id)
        # all bundles should be on the same node
        node_id = specs["bundles_to_node_id"][0]
        pg_ip[pg.id] = node_ip[node_id]
    return sorted(pgs, key=lambda pg: pg_ip[pg.id])


class RayResourcePool(ResourcePool):

    def __init__(self,
                 process_on_nodes: Optional[List[int]] = None,
                 use_gpu: bool = True,
                 name_prefix: str = "",
                 max_colocate_count: int = 10,
                 detached=False) -> None:
        super().__init__(process_on_nodes, max_colocate_count)
        self.use_gpu = use_gpu
        # print(f"in RayProcessDispatchConfiguration: name_prefix = {name_prefix}")
        self.name_prefix = name_prefix
        self.pgs = None
        self.detached = detached

    def get_placement_groups(self, strategy="STRICT_PACK", name=None):
        if self.pgs is not None:
            return self.pgs

        pg_name_prefix = name if name else \
            f"{self.name_prefix}verl_group_{'_'.join([str(count) for count in self._store])}:"
        # print(f"pg_name_prefix = {pg_name_prefix}")
        pg_scheme = [[{
            "CPU": self.max_colocate_count,
            "GPU": 1
        } if self.use_gpu else {
            "CPU": self.max_colocate_count
        } for _ in range(process_count)] for process_count in self._store]

        lifetime = 'detached' if self.detached else None

        pgs = [
            placement_group(bundles=bundles, strategy=strategy, name=pg_name_prefix + str(idx), lifetime=lifetime)
            for idx, bundles in enumerate(pg_scheme)
        ]

        ray.get([pg.ready() for pg in pgs])

        self.pgs = pgs
        return pgs


def extract_pg_from_exist(resource_pools: Dict[str, RayResourcePool], src_role_names: List[str],
                          resource_pool: RayResourcePool) -> List:
    src_pgs = [
        pg for role_name, resource_pool in resource_pools.items() for pg in resource_pool.get_placement_groups()
        if role_name in src_role_names
    ]

    sorted_src_pgs = sorted(src_pgs, key=lambda pg: pg.bundle_count, reverse=True)
    sorted_process_on_nodes = sorted([(val, idx) for idx, val in enumerate(resource_pool.store)], reverse=True)

    unsorted_pgs: List[Tuple[int, PlacementGroup]] = []
    searching_idx = 0
    for request_process, original_idx in sorted_process_on_nodes:
        assert searching_idx < len(sorted_src_pgs), f"not enough nodes for request: searching {searching_idx}th node"
        assert request_process <= sorted_src_pgs[searching_idx].bundle_count, \
            f"requesting {request_process} processes, bundle count cannot satisfy"
        unsorted_pgs.append((original_idx, sorted_src_pgs[searching_idx]))
        searching_idx += 1

    return [pg for _, pg in sorted(unsorted_pgs)]


def merge_resource_pool(rp1: RayResourcePool, rp2: RayResourcePool) -> RayResourcePool:
    assert rp1.use_gpu == rp2.use_gpu, 'Both RayResourcePool must either use_gpu or not'
    assert rp1.max_colocate_count == rp2.max_colocate_count, 'Both RayResourcePool must have the same max_colocate_count'
    assert rp1.n_gpus_per_node == rp2.n_gpus_per_node, 'Both RayResourcePool must have the same n_gpus_per_node'
    assert rp1.detached == rp2.detached, 'Detached ResourcePool cannot be merged with non-detached ResourcePool'

    new_store = rp1.store + rp2.store

    merged = RayResourcePool(new_store, rp1.use_gpu, f"{rp1.name_prefix}_{rp2.name_prefix}")
    merged.pgs = rp1.get_placement_groups() + rp2.get_placement_groups()

    return merged


class RayClassWithInitArgs(ClassWithInitArgs):

    def __init__(self, cls, *args, **kwargs) -> None:
        # self._options = kwargs.pop('options', dict())
        super().__init__(cls, *args, **kwargs)
        self._options = {}
        self._additional_resource = {}

    def set_additional_resource(self, additional_resource):
        self._additional_resource = additional_resource

    def update_options(self, options: Dict):
        self._options.update(options)

    def __call__(self,
                 placement_group,
                 placement_group_bundle_idx,
                 use_gpu: bool = True,
                 num_gpus=1,
                 sharing_with=None) -> Any:
        if sharing_with is not None:
            target_node_id = ray.get(sharing_with.get_node_id.remote())
            cuda_visible_devices = ray.get(sharing_with.get_cuda_visible_devices.remote())
            options = {"scheduling_strategy": NodeAffinitySchedulingStrategy(node_id=target_node_id, soft=False)}
            return self.cls.options(**options).remote(*self.args,
                                                      cuda_visible_devices=cuda_visible_devices,
                                                      **self.kwargs)

        options = {
            "scheduling_strategy":
                PlacementGroupSchedulingStrategy(placement_group=placement_group,
                                                 placement_group_bundle_index=placement_group_bundle_idx)
        }
        options.update(self._options)

        if use_gpu:
            options["num_gpus"] = num_gpus

        if len(self._additional_resource) > 1:
            for k, v in self._additional_resource.items():
                options[k] = v

        # print("cls:", self.cls)
        # print("args: ", self.args)
        # print("kwargs: ", self.kwargs)
        return self.cls.options(**options).remote(*self.args, **self.kwargs)


class RayWorkerGroup(WorkerGroup):

    def __init__(self,
                 resource_pool: RayResourcePool = None,
                 ray_cls_with_init: RayClassWithInitArgs = None,
                 bin_pack: bool = True,
                 name_prefix: str = None,
                 detached=False,
                 worker_names=None,
                 ray_wait_register_center_timeout: int = 300,
                 **kwargs) -> None:
        super().__init__(resource_pool=resource_pool, **kwargs)
        self.ray_cls_with_init = ray_cls_with_init
        self.name_prefix = get_random_string(length=6) if name_prefix is None else name_prefix
        self._ray_wait_register_center_timeout = ray_wait_register_center_timeout

        if worker_names is not None:
            assert self._is_init_with_detached_workers
            self._worker_names = worker_names

        if self._is_init_with_detached_workers:
            self._init_with_detached_workers(worker_names=worker_names)
        else:
            self._init_with_resource_pool(resource_pool=resource_pool,
                                          ray_cls_with_init=ray_cls_with_init,
                                          bin_pack=bin_pack,
                                          detached=detached)

        if ray_cls_with_init is not None:
            self._bind_worker_method(self.ray_cls_with_init.cls, func_generator)

    def _is_worker_alive(self, worker: ray.actor.ActorHandle):
        worker_state_dict = get_actor(worker._actor_id.hex())
        return worker_state_dict.get("state", "undefined") == "ALIVE" if worker_state_dict is not None else False

    def _init_with_detached_workers(self, worker_names):
        workers = [ray.get_actor(name=name) for name in worker_names]
        self._workers = workers
        self._world_size = len(worker_names)

    def _init_with_resource_pool(self, resource_pool, ray_cls_with_init, bin_pack, detached):
        use_gpu = resource_pool.use_gpu

        strategy = "PACK"
        if bin_pack:
            strategy = "STRICT_PACK"
        pgs = resource_pool.get_placement_groups(strategy=strategy)
        world_size = resource_pool.world_size
        self._world_size = world_size
        # cia.add_kwarg("_world_size", world_size)
        num_gpus = 1 / resource_pool.max_colocate_count

        rank = -1
        local_world_size = resource_pool.store[0]
        for pg_idx, pg in enumerate(sort_placement_group_by_node_ip(pgs)):
            assert local_world_size <= pg.bundle_count, \
                f"when generating for {self.name_prefix}, for the "
            for local_rank in range(local_world_size):
                rank += 1

                # we pass in environment variable at option so that Worker can use environment variable to set
                env_vars = {
                    'WORLD_SIZE': str(world_size),
                    'RANK': str(rank),
                    'WG_PREFIX': self.name_prefix,
                    'WG_BACKEND': 'ray',
                    'RAY_LOCAL_WORLD_SIZE': str(local_world_size),
                    'RAY_LOCAL_RANK': str(local_rank),
                }
                if rank != 0:
                    env_vars['MASTER_ADDR'] = self._master_addr
                    env_vars['MASTER_PORT'] = self._master_port

                import re
                cia_name = type(ray_cls_with_init.cls).__name__
                match = re.search(r"ActorClass\(([^)]+)\)", cia_name)  # ray.remote(Obj) -> "ActorClass(Obj)"
                cia_name = match.group(1) if match else cia_name  # "ActorClass(Obj)" -> "Obj"
                name = f"{self.name_prefix}{cia_name}_{pg_idx}:{local_rank}"  # e.g. Worker_2:5

                ray_cls_with_init.update_options({'runtime_env': {'env_vars': env_vars}, 'name': name})

                if detached:
                    ray_cls_with_init.update_options({'lifetime': 'detached'})

                # create a worker
                worker = ray_cls_with_init(placement_group=pg,
                                           placement_group_bundle_idx=local_rank,
                                           use_gpu=use_gpu,
                                           num_gpus=num_gpus)
                self._workers.append(worker)
                self._worker_names.append(name)

                if rank == 0:
                    register_center_actor = None
                    actor_name = f"{self.name_prefix}_register_center"
                    start_time = time.time()

                    while time.time() - start_time < self._ray_wait_register_center_timeout:
                        if actor_name in list_named_actors():
                            register_center_actor = ray.get_actor(actor_name)
                            break

                        elapsed = int(time.time() - start_time)
                        if elapsed % 30 == 0:
                            logging.warning(
                                f"Waiting for register center actor {actor_name} to be ready. "
                                f"Elapsed time: {elapsed} seconds out of {self._ray_wait_register_center_timeout} seconds.")
                        time.sleep(1)

                    if register_center_actor is None:
                        raise TimeoutError(
                            f"Failed to get register_center_actor {actor_name} in {list_named_actors(all_namespaces=True)} "
                            f"for {self._ray_wait_register_center_timeout} seconds. "
                            "Ensure that any lingering Ray resources from previous runs are cleaned up (e.g., by restarting the Ray cluster), "
                            "or adjust the waiting time by modifying the config `trainer.ray_wait_register_center_timeout`.")

                    rank_zero_info = ray.get(register_center_actor.get_rank_zero_info.remote())
                    self._master_addr, self._master_port = rank_zero_info['MASTER_ADDR'], rank_zero_info['MASTER_PORT']
                    # print(f"rank_zero_info: {rank_zero_info}")
                    # print(f"master_addr: {self._master_addr}, master_port: {self._master_port}")

    @property
    def worker_names(self):
        return self._worker_names

    @classmethod
    def from_detached(cls, worker_names=None, ray_cls_with_init=None):
        worker_group = cls(resource_pool=None,
                           ray_cls_with_init=ray_cls_with_init,
                           name_prefix=None,
                           worker_names=worker_names)
        return worker_group

    def spawn(self, prefix_set):
        """
        Spawn a dictionary of worker groups, each with the subset of methods carrying the given prefix.
        """

        def _rebind_actor_methods(worker_group, actor_name):
            """
            Bind the methods with actor_name prefix back to their original names.
            """
            prefix: str = actor_name + '_'
            for method_name in dir(worker_group):
                if method_name.startswith(prefix):
                    # only valid when Python >= 3.9
                    original_method_name = method_name.removeprefix(prefix)
                    method = getattr(worker_group, method_name)
                    setattr(worker_group, original_method_name, method)

        new_worker_group_dict = {}
        for prefix in prefix_set:
            new_worker_group = self.from_detached(worker_names=self._worker_names,
                                                  ray_cls_with_init=self.ray_cls_with_init)

            _rebind_actor_methods(new_worker_group, prefix)
            new_worker_group_dict[prefix] = new_worker_group
        return new_worker_group_dict

    def execute_rank_zero_sync(self, method_name: str, *args, **kwargs):
        return ray.get(self.execute_rank_zero_async(method_name, *args, **kwargs))

    def execute_rank_zero_async(self, method_name: str, *args, **kwargs):
        remote_call = getattr(self._workers[0], method_name)
        return remote_call.remote(*args, **kwargs)

    def execute_rank_zero(self, method_name: str, *args, **kwargs):
        return self.execute_rank_zero_async(method_name, *args, **kwargs)

    def execute_all(self, method_name: str, *args, **kwargs):
        return self.execute_all_async(method_name, *args, **kwargs)

    def execute_all_sync(self, method_name: str, *args, **kwargs):
        return ray.get(self.execute_all_async(method_name, *args, **kwargs))

    def execute_all_async(self, method_name: str, *args, **kwargs):
        # Here, we assume that if all arguments in args and kwargs are lists, and their lengths match len(self._workers),
        # we'll distribute each element in these lists to the corresponding worker
        # print(f"execute_all_async: method {method_name}({args}, {kwargs})")
        length = len(self._workers)
        if all(isinstance(arg, list) for arg in args) and all(isinstance(kwarg, list) for kwarg in kwargs.values()):
            if all(len(arg) == length for arg in args) and all(len(kwarg) == length for kwarg in kwargs.values()):
                # print(f"splitting args and kwargs into {length} shards")
                result = []
                for i in range(length):
                    sliced_args = tuple(arg[i] for arg in args)
                    sliced_kwargs = {k: v[i] for k, v in kwargs.items()}
                    remote_call = getattr(self._workers[i], method_name)
                    result.append(remote_call.remote(*sliced_args, **sliced_kwargs))
                return result

        return [getattr(worker, method_name).remote(*args, **kwargs) for worker in self._workers]

    @property
    def master_address(self):
        return self._master_addr

    @property
    def master_port(self):
        return self._master_port

    @property
    def workers(self):
        return self._workers

    @property
    def world_size(self):
        return self._world_size


"""
Utilities that enable creating workers inside the same ray.Actor,
with code written in separate ray.Actors.
"""

from unittest.mock import patch
from verl.single_controller.base.decorator import MAGIC_ATTR
import os


def _bind_workers_method_to_parent(cls, key, user_defined_cls):
    """
    Binds the methods of each worker to the WorkerDict.
    Note that we only bind public methods that are decorated by register.
    """
    for method_name in dir(user_defined_cls):
        try:
            method = getattr(user_defined_cls, method_name)
            assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
        except Exception as e:
            # if it is a property, it will fail because Class doesn't have instance property
            continue

        if hasattr(method, MAGIC_ATTR):

            def generate_function(name):

                def func(self, *args, **kwargs):
                    # dispatch to the actual worker
                    return getattr(self.worker_dict[key], name)(*args, **kwargs)

                return func

            func = generate_function(method_name)
            # pass MAGIC_ATTR for outer worker group
            setattr(func, MAGIC_ATTR, getattr(method, MAGIC_ATTR))
            try:
                method_name_with_prefix = key + '_' + method_name
                setattr(cls, method_name_with_prefix, func)
                # print(f'Binding {method_name_with_prefix}')
            except Exception as e:
                raise ValueError(f'Fail to set method_name {method_name}')


def _unwrap_ray_remote(cls):
    if hasattr(cls, '__ray_actor_class__'):
        cls = cls.__ray_actor_class__
    return cls


def create_colocated_worker_cls(class_dict: dict[str, RayClassWithInitArgs]):
    """
    This function should return a class instance that delegates the calls to every
    cls in cls_dict.
    """
    cls_dict = {}
    init_args_dict = {}
    worker_cls = None
    for key, cls in class_dict.items():
        if worker_cls is None:
            worker_cls = cls.cls.__ray_actor_class__.__base__
        else:
            assert worker_cls == cls.cls.__ray_actor_class__.__base__, \
                'the worker class should be the same when sharing the same process'
        cls_dict[key] = cls.cls
        init_args_dict[key] = {'args': cls.args, 'kwargs': cls.kwargs}

    assert cls_dict.keys() == init_args_dict.keys()

    # TODO: create a class with customizable name
    class WorkerDict(worker_cls):

        def __init__(self):
            super().__init__()
            self.worker_dict = {}
            for key, user_defined_cls in cls_dict.items():
                user_defined_cls = _unwrap_ray_remote(user_defined_cls)
                # directly instantiate the class without remote
                # in worker class, e.g. <verl.single_controller.base.worker.Worker> when DISABLE_WORKER_INIT == 1 it will return immediately
                with patch.dict(os.environ, {'DISABLE_WORKER_INIT': '1'}):
                    self.worker_dict[key] = user_defined_cls(*init_args_dict[key].get('args', ()),
                                                             **init_args_dict[key].get('kwargs', {}))

    # now monkey-patch the methods from inner class to WorkerDict
    for key, user_defined_cls in cls_dict.items():
        user_defined_cls = _unwrap_ray_remote(user_defined_cls)
        _bind_workers_method_to_parent(WorkerDict, key, user_defined_cls)

    remote_cls = ray.remote(WorkerDict)
    remote_cls = RayClassWithInitArgs(cls=remote_cls)
    return remote_cls
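For orientation, the sketch below shows how the pieces defined in this file are typically wired together: a RayResourcePool describes how many processes go on each node, RayClassWithInitArgs wraps a ray.remote Worker subclass together with its constructor arguments, and RayWorkerGroup instantiates one actor per placement-group bundle and exposes execute_* helpers. This is a minimal, hypothetical usage sketch and is not part of the commit; EchoWorker and who_am_i are illustrative names, and it assumes a running Ray cluster with two GPUs on one node plus the verl Worker base class behaving as imported above (its rank-0 instance publishes the register-center actor this file waits for).

# Hypothetical usage sketch (not part of this commit).
import os
import ray

from verl.single_controller.base import Worker
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup


@ray.remote
class EchoWorker(Worker):
    # illustrative worker; RANK / WORLD_SIZE come from the env vars that
    # RayWorkerGroup._init_with_resource_pool injects via runtime_env
    def who_am_i(self):
        return f"rank {os.environ['RANK']} of {os.environ['WORLD_SIZE']}"


ray.init()
pool = RayResourcePool(process_on_nodes=[2], use_gpu=True)   # 2 worker processes on one node
worker_cls = RayClassWithInitArgs(cls=EchoWorker)
wg = RayWorkerGroup(resource_pool=pool, ray_cls_with_init=worker_cls)

print(wg.execute_all_sync('who_am_i'))        # e.g. ['rank 0 of 2', 'rank 1 of 2']
print(wg.execute_rank_zero_sync('who_am_i'))  # 'rank 0 of 2'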
verl/single_controller/ray/megatron.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Optional

import ray

from .base import RayWorkerGroup, RayResourcePool, RayClassWithInitArgs
from verl.single_controller.base.megatron.worker import DistRankInfo, DistGlobalInfo
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup


# NOTE(sgm): for open-source megatron-core
class NVMegatronRayWorkerGroup(RayWorkerGroup, MegatronWorkerGroup):
    """
    MegatronWorkerGroup will query each worker of its megatron rank info and store it inside the WorkerGroup
    so that the dispatcher can use it to dispatch data.
    """

    def __init__(self, resource_pool: RayResourcePool, ray_cls_with_init: RayClassWithInitArgs, **kwargs):
        super().__init__(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, **kwargs)
        self._megatron_rank_info: DistRankInfo = self.execute_all_sync(method_name='get_megatron_rank_info')
        self._megatron_global_info: DistGlobalInfo = ray.get(
            self.execute_rank_zero_async(method_name='get_megatron_global_info'))


class MegatronRayWorkerGroup(RayWorkerGroup, MegatronWorkerGroup):
    """
    MegatronWorkerGroup will query each worker of its megatron rank info and store it inside the WorkerGroup
    so that the dispatcher can use it to dispatch data.
    """

    def __init__(self,
                 resource_pool: RayResourcePool,
                 ray_cls_with_init: RayClassWithInitArgs,
                 default_megatron_kwargs: Dict = None,
                 **kwargs):
        super().__init__(resource_pool=resource_pool,
                         ray_cls_with_init=ray_cls_with_init,
                         default_megatron_kwargs=default_megatron_kwargs,
                         **kwargs)
        self.init_megatron(default_megatron_kwargs=default_megatron_kwargs)
        self._megatron_rank_info: DistRankInfo = self.execute_all_sync(method_name='get_megatron_rank_info')
        self._megatron_global_info: DistGlobalInfo = ray.get(
            self.execute_rank_zero_async(method_name='get_megatron_global_info'))

    def init_megatron(self, default_megatron_kwargs: Optional[Dict] = None):
        # after super, we will call init of each worker
        if not self._is_init_with_detached_workers:
            # only init_megatron if the WorkerGroup is created from scratch
            self.execute_all_sync(method_name='init_megatron', default_megatron_kwargs=default_megatron_kwargs)
verl/third_party/__init__.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
verl/third_party/sglang/__init__.py
0 → 100644
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
verl/third_party/sglang/parallel_state.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The SGlang team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""
import os
from typing import Optional

import torch
import torch.distributed

import sglang.srt.distributed.parallel_state as ps
from sglang.srt.distributed.parallel_state import (
    get_pp_group,
    get_world_group,
    init_distributed_environment,
    init_model_parallel_group,
)

"""
This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
- We assume the Megatron tp+dp+pp world is already established before calling this function.
"""

# Device mesh for using DTensor
_DEVICE_MESH = None

# Tensor model parallel group that the current rank belongs to.
_TP = None
# Pipeline model parallel group that the current rank belongs to.
_PP = None


# This method is for initializing the ParallelGroup when using HybridEngine
# NOTE(linjunrong): this function is for megatron
def initialize_parallel_state(
    distributed_init_method: str = "env://",
    backend: str = "nccl",
    tensor_model_parallel_size: int = 1,
    num_tp_per_train_tp: int = 1,
    pipeline_model_parallel_size: int = 1,
):
    # torch.distributed.all_reduce does not free the input tensor until
    # the synchronization point. This causes the memory usage to grow
    # as the number of all_reduce calls increases. This env var disables
    # this behavior.
    # Related issue:
    # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
    rank = int(os.getenv("RANK", "-1"))
    local_rank = int(os.getenv("LOCAL_RANK", "0"))

    # Use the world_size set by TORCHRUN
    world_size = int(os.getenv("WORLD_SIZE", "-1"))
    assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
    init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
    if torch.distributed.get_world_size() > 1:
        # NOTE: build a separate inference group with infer tp & micro dp
        initialize_model_parallel_for_sglang(
            tensor_model_parallel_size=tensor_model_parallel_size,
            num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp,
        )
    else:
        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)


# NOTE(linjunrong): After init SGLang rollout using class EngineFragment, user should always remember to call
# this function to sync the _TP, _PP defined at the beginning of this file. Otherwise, only the counterparts
# inside sglang.srt.distributed are init as ProcessGroup, and the symbols defined in this file remain None.
# It could be weird to maintain two _TP and _PP; I follow the same way of keeping an extra pair for
# veRL itself as was done in verl.third_party.vllm.parallel_state. Note that the process is a little
# bit different.
def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
    values if the model parallel groups are initialized.
    """
    # get the backend of _DEVICE_WORLD_GROUP
    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
    if not model_parallel_is_initialized():
        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
        return

    assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
        "tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{tensor_model_parallel_size=}")
    pp_world_size = get_pp_group().world_size
    assert pp_world_size == pipeline_model_parallel_size, (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{pipeline_model_parallel_size=}")


# TODO(sgm): deviate from the v0.5.4, not pp now
# NOTE(linjunrong): the SGLang version uses _TP instead of ps._TP
def model_parallel_is_initialized():
    """Check if tensor and pipeline parallel groups are initialized."""
    return _TP is not None
    # and _PIPELINE_MODEL_PARALLEL_GROUP is not None)


def initialize_model_parallel_for_sglang(
    tensor_model_parallel_size: int,
    num_tensor_model_parallel_groups_per_train_tp: int = 1,
    pipeline_model_parallel_size: int = 1,
) -> None:
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    assert isinstance(tensor_model_parallel_size, int)

    # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group
    # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group

    # Build the tensor model-parallel groups.
    assert ps._TP is None, "tensor model parallel group is already initialized"

    global _TP

    world_size: int = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    backend = torch.distributed.get_backend()

    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size

    if num_tensor_model_parallel_groups_per_train_tp == 1:
        # if tensor_model_parallel_size == train_tensor_parallel_size:
        # using the same tp group as Megatron/vllm
        assert _TP is None, "tensor model parallel group is already initialized"
        group_ranks = []
        for i in range(num_tensor_model_parallel_groups):
            ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
            group_ranks.append(ranks)
        _TP = init_model_parallel_group(
            group_ranks=group_ranks,
            local_rank=get_world_group().local_rank,
            backend=backend,
            use_custom_allreduce=False,  # TODO: check why True does not work in Ray trainer
            use_message_queue_broadcaster=True,
        )
        ps._TP = _TP
        # _MICRO_DATA_PARALLEL_GROUP is moved to hybrid engine
    else:
        # initialize a micro_dp group and a tp group
        # assume training tp=4, infer tp=2, then the weight is partitioned as
        # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference

        # Build the inference tp groups
        # train_tp = train_tensor_parallel_size
        train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size
        # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
        assert _TP is None, "tensor model parallel group is already initialized"
        group_ranks = []
        for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
            start = train_tp * i
            end = train_tp * (i + 1)
            for j in range(num_tensor_model_parallel_groups_per_train_tp):
                ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp))
                for i in range(len(ranks)):
                    ranks[i] += j
                group_ranks.append(ranks)
        _TP = init_model_parallel_group(
            group_ranks=group_ranks,
            local_rank=get_world_group().local_rank,
            backend=backend,
            use_custom_allreduce=False,  # TODO: check why True does not work in Ray trainer
            use_message_queue_broadcaster=True,
        )
        ps._TP = _TP

    # Build the pipeline model-parallel groups.
    # global _PIPELINE_MODEL_PARALLEL_GROUP
    # global _PIPELINE_GLOBAL_RANKS
    # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
    # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
    # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()

    # TODO: init using device mesh (not support hybrid engine now)
    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
    global _PP
    assert _PP is None, "pipeline model parallel group is already initialized"
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False)
    ps._PP = _PP


# for verl
def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """
    NOTE: This method is a hack from the open-sourced version without
    the assertion of world_size = tp * pp.

    Initialize model parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used for tensor model
            parallelism.
        pipeline_model_parallel_size: number of GPUs used for pipeline model
            parallelism.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group)

    # NOTE(sgm) we don't assert world_size == tp * pp
    # DP is not managed by vllm but by the VeRL WorkerGroup
    # if (world_size !=
    #         tensor_model_parallel_size * pipeline_model_parallel_size):
    #     raise RuntimeError(
    #         f"world_size ({world_size}) is not equal to "
    #         f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
    #         f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")

    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
    global _TP
    assert _TP is None, "tensor model parallel group is already initialized"
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        ranks = list(range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size))
        group_ranks.append(ranks)

    # message queue broadcaster is only used in tensor model parallel group
    if ps._TP is not None:
        _TP = ps._TP
    else:
        _TP = init_model_parallel_group(
            group_ranks,
            get_world_group().local_rank,
            backend,
            use_custom_allreduce=False,  # TODO: check why True does not work in Ray trainer
            use_message_queue_broadcaster=True,
        )
    ps._TP = _TP

    # TODO: init using device mesh (not support hybrid engine now)
    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
    global _PP
    assert _PP is None, "pipeline model parallel group is already initialized"
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    if ps._TP is not None:
        _PP = ps._TP
    else:
        _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False)
    ps._PP = _PP


"""
Device mesh utilities
"""


def get_device_mesh():
    assert _DEVICE_MESH is not None, "device mesh is not initialized"
    return _DEVICE_MESH


"""
Tensor model parallel utilities
"""


# NOTE(linjunrong): In the vllm version of parallel_state.py, veRL created its own _TP and _PP because veRL wants to use
# the process group for some extra purposes. Under the hood, there is no difference between them and the original
# ones in vllm.distributed.parallel_state. However, that implementation needs to hack the init process of the inference
# engine; as we do not maintain another SGLang here, I just use the original _TP and _PP directly.
def get_tensor_model_parallel_group():
    """Get the tensor model parallel group the caller rank belongs to."""
    assert _TP is not None, "tensor model parallel group is not initialized"
    return _TP.device_group


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())


def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group."""
    global_rank = torch.distributed.get_rank()
    local_world_size = get_tensor_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size
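To make the inference TP layout concrete, here is a standalone sketch (plain Python, no torch or distributed setup required) that mirrors the group_ranks construction used by initialize_model_parallel_for_sglang in the train_tp > infer_tp case. The sizes (world_size=8, inference TP=2, two inference TP groups per training TP group) are illustrative, not taken from the commit.

# Standalone sketch of the inference TP group layout (illustrative sizes).
world_size = 8
tensor_model_parallel_size = 2                       # inference TP
num_tensor_model_parallel_groups_per_train_tp = 2    # train_tp // infer_tp
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size

group_ranks = []
for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
    start = train_tp * i
    end = train_tp * (i + 1)
    for j in range(num_tensor_model_parallel_groups_per_train_tp):
        ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp))
        for k in range(len(ranks)):
            ranks[k] += j
        group_ranks.append(ranks)

print(group_ranks)   # [[0, 2], [1, 3], [4, 6], [5, 7]]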
verl/third_party/vllm/__init__.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from importlib.metadata import version, PackageNotFoundError
from packaging import version as vs

from verl.utils.import_utils import is_sglang_available


def get_version(pkg):
    try:
        return version(pkg)
    except PackageNotFoundError:
        return None


package_name = 'vllm'
package_version = get_version(package_name)
vllm_version = None

if package_version == '0.3.1':
    vllm_version = '0.3.1'
    from .vllm_v_0_3_1.llm import LLM
    from .vllm_v_0_3_1.llm import LLMEngine
    from .vllm_v_0_3_1 import parallel_state
elif package_version == '0.4.2':
    vllm_version = '0.4.2'
    from .vllm_v_0_4_2.llm import LLM
    from .vllm_v_0_4_2.llm import LLMEngine
    from .vllm_v_0_4_2 import parallel_state
elif package_version == '0.5.4':
    vllm_version = '0.5.4'
    from .vllm_v_0_5_4.llm import LLM
    from .vllm_v_0_5_4.llm import LLMEngine
    from .vllm_v_0_5_4 import parallel_state
elif package_version == '0.6.3':
    vllm_version = '0.6.3'
    from .vllm_v_0_6_3.llm import LLM
    from .vllm_v_0_6_3.llm import LLMEngine
    from .vllm_v_0_6_3 import parallel_state
elif package_version == '0.6.3+rocm624' or package_version == '0.6.2+das.opt3.dtk2504':
    vllm_version = '0.6.3'
    from .vllm_v_0_6_3.llm import LLM
    from .vllm_v_0_6_3.llm import LLMEngine
    from .vllm_v_0_6_3 import parallel_state
elif vs.parse(package_version) >= vs.parse('0.7.0'):
    # From 0.6.6.post2 on, vllm supports SPMD inference
    # See https://github.com/vllm-project/vllm/pull/12071
    from vllm import LLM
    from vllm.distributed import parallel_state
else:
    if not is_sglang_available():
        raise ValueError(
            f'vllm version {package_version} not supported and SGLang also not Found. '
            'Currently supported vllm versions are 0.3.1, 0.4.2, 0.5.4, 0.6.3 and 0.7.0+')
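A hypothetical caller-side sketch of the version dispatch above: downstream code imports the re-exported symbols from this package and can branch on vllm_version for the legacy customized engines. It assumes one of the supported vllm builds is actually installed (otherwise the package-level import falls through to the SGLang check and may raise); the print messages are illustrative only.

# Hypothetical sketch: consume the version-dispatched symbols re-exported above.
from verl.third_party.vllm import vllm_version, LLM, parallel_state

if vllm_version in ('0.3.1', '0.4.2', '0.5.4', '0.6.3'):
    print(f"using verl's customized rollout engine for vllm {vllm_version}")
else:
    # vllm >= 0.7.0: vllm_version stays None and the upstream (SPMD-capable) LLM is used
    print("using upstream vllm (SPMD path)")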
verl/third_party/vllm/vllm_v_0_3_1/__init__.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Dict, Optional, Tuple

import torch.nn as nn
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from transformers import PretrainedConfig

from .config import ModelConfig


@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model_hf_config: PretrainedConfig = None
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    load_format: str = 'model'
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    lora_dtype = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'cuda'

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # Model arguments
        # TODO(shengguangming): delete the unused args
        parser.add_argument('--model',
                            type=str,
                            default='facebook/opt-125m',
                            help='name or path of the huggingface model to use')
        parser.add_argument('--tokenizer',
                            type=str,
                            default=EngineArgs.tokenizer,
                            help='name or path of the huggingface tokenizer to use')
        parser.add_argument('--revision',
                            type=str,
                            default=None,
                            help='the specific model version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-revision',
                            type=str,
                            default=None,
                            help='the specific tokenizer version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument('--load-format',
                            type=str,
                            default=EngineArgs.load_format,
                            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
                            help='The format of the model weights to load. '
                            '"auto" will try to load the weights in the safetensors format '
                            'and fall back to the pytorch bin format if safetensors format '
                            'is not available. '
                            '"pt" will load the weights in the pytorch bin format. '
                            '"safetensors" will load the weights in the safetensors format. '
                            '"npcache" will load the weights in pytorch format and store '
                            'a numpy cache to speed up the loading. '
                            '"dummy" will initialize the weights with random values, '
                            'which is mainly for profiling.')
        parser.add_argument('--dtype',
                            type=str,
                            default=EngineArgs.dtype,
                            choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
                            help='data type for model weights and activations. '
                            'The "auto" option will use FP16 precision '
                            'for FP32 and FP16 models, and BF16 precision '
                            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for'
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', None],
                            default=None,
                            help='Method used to quantize the weights')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.load_format, self.revision,
                                   self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager,
                                   self.max_context_len_to_capture)
        cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype,
                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
                                         self.max_parallel_loading_workers, self.disable_custom_all_reduce)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len,
                                           self.max_paddings)
        lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
                                 max_loras=self.max_loras,
                                 lora_extra_vocab_size=self.lora_extra_vocab_size,
                                 lora_dtype=self.lora_dtype,
                                 max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0
                                 else None) if self.enable_lora else None
        return (model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config)


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
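A hypothetical sketch of the verl-specific entry point in this EngineArgs variant: the engine is configured from an in-memory HuggingFace config (model_hf_config) rather than a model path, and create_engine_configs() materializes the vLLM config objects. It assumes vllm 0.3.1 and transformers are installed; the model name and argument values are illustrative.

# Hypothetical sketch (assumes vllm 0.3.1): build engine configs from a HF config object.
from transformers import AutoConfig
from verl.third_party.vllm.vllm_v_0_3_1.arg_utils import EngineArgs

hf_config = AutoConfig.from_pretrained('facebook/opt-125m')   # illustrative model
engine_args = EngineArgs(model_hf_config=hf_config, dtype='float16', gpu_memory_utilization=0.5)

(model_config, cache_config, parallel_config,
 scheduler_config, device_config, lora_config) = engine_args.create_engine_configs()
print(model_config.max_model_len, parallel_config.tensor_parallel_size)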
verl/third_party/vllm/vllm_v_0_3_1/config.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
from
typing
import
Optional
,
Union
,
ClassVar
from
dataclasses
import
dataclass
import
torch
from
transformers
import
PretrainedConfig
from
packaging.version
import
Version
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.config
import
get_config
from
vllm.utils
import
get_cpu_memory
,
is_hip
,
get_nvcc_cuda_version
logger
=
init_logger
(
__name__
)
_GB
=
1
<<
30
class
ModelConfig
:
"""Configuration for the model.
Args:
model: Name or path of the huggingface model to use.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
download_dir: Directory to download and load the weights, default to the
default cache directory of huggingface.
load_format: The format of the model weights to load:
"auto" will try to load the weights in the safetensors format and
fall back to the pytorch bin format if safetensors format is
not available.
"pt" will load the weights in the pytorch bin format.
"safetensors" will load the weights in the safetensors format.
"npcache" will load the weights in pytorch format and store
a numpy cache to speed up the loading.
"dummy" will initialize the weights with random values, which is
mainly for profiling.
dtype: Data type for model weights and activations. The "auto" option
will use FP16 precision for FP32 and FP16 models, and BF16 precision
for BF16 models.
seed: Random seed for reproducibility.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id. If unspecified, will use the default
version.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id. If unspecified, will use
the default version.
max_model_len: Maximum length of a sequence (including prompt and
output). If None, will be derived from the model.
quantization: Quantization method that was used to quantize the model
weights. If None, we assume the model weights are not quantized.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
"""
    def __init__(
        self,
        hf_config: PretrainedConfig,
        dtype: str,
        seed: int,
        load_format: str = 'model',
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        trust_remote_code: Optional[bool] = True,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
    ) -> None:
        self.model = hf_config._name_or_path
        self.tokenizer = hf_config._name_or_path
        self.load_format = load_format
        self.seed = seed
        self.revision = revision
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.trust_remote_code = trust_remote_code
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture

        # self.hf_config = get_config(model, trust_remote_code, revision)
        self.hf_config = hf_config
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
        # self._verify_load_format()
        # self._verify_tokenizer_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

    def _verify_load_format(self) -> None:
        load_format = self.load_format.lower()
        if load_format not in ["auto", "pt", "safetensors", "npcache", "dummy", "model"]:
            raise ValueError(f"Unknown load format: {self.load_format}. Must be one of "
                             "'auto', 'pt', 'safetensors', 'npcache', 'dummy' or 'model'.")
        self.load_format = load_format

    # def _verify_tokenizer_mode(self) -> None:
    #     tokenizer_mode = self.tokenizer_mode.lower()
    #     if tokenizer_mode not in ["auto", "slow"]:
    #         raise ValueError(
    #             f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
    #             "either 'auto' or 'slow'.")
    #     self.tokenizer_mode = tokenizer_mode

    def _verify_quantization(self) -> None:
        supported_quantization = ["awq", "gptq", "squeezellm"]
        rocm_not_supported_quantization = ["awq", "gptq"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()

        # Parse quantization method from the HF model config, if available.
        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
        if hf_quant_config is not None:
            hf_quant_method = str(hf_quant_config["quant_method"]).lower()
            if self.quantization is None:
                self.quantization = hf_quant_method
            elif self.quantization != hf_quant_method:
                raise ValueError(
                    "Quantization method specified in the model config "
                    f"({hf_quant_method}) does not match the quantization "
                    f"method specified in the `quantization` argument "
                    f"({self.quantization}).")

        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(f"Unknown quantization method: {self.quantization}. Must "
                                 f"be one of {supported_quantization}.")
            if is_hip() and self.quantization in rocm_not_supported_quantization:
                raise ValueError(f"{self.quantization} quantization is currently not supported "
                                 f"in ROCm.")
            logger.warning(f"{self.quantization} quantization is not fully "
                           "optimized yet. The speed can be slower than "
                           "non-quantized models.")

    def _verify_cuda_graph(self) -> None:
        if self.max_context_len_to_capture is None:
            self.max_context_len_to_capture = self.max_model_len
        self.max_context_len_to_capture = min(self.max_context_len_to_capture, self.max_model_len)
        if (self.quantization in ["gptq", "squeezellm"] and not self.enforce_eager):
            # Related issue: https://github.com/vllm-project/vllm/issues/2147
            logger.warning(f"{self.quantization} does not support CUDA graph "
                           "yet. Disabling CUDA graph.")
            self.enforce_eager = True

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_num_attention_heads = self.hf_config.num_attention_heads
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if total_num_attention_heads % tensor_parallel_size != 0:
            raise ValueError(f"Total number of attention heads ({total_num_attention_heads})"
                             " must be divisible by tensor parallel size "
                             f"({tensor_parallel_size}).")

        total_num_hidden_layers = self.hf_config.num_hidden_layers
        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if total_num_hidden_layers % pipeline_parallel_size != 0:
            raise ValueError(f"Total number of hidden layers ({total_num_hidden_layers}) "
                             "must be divisible by pipeline parallel size "
                             f"({pipeline_parallel_size}).")

    def get_sliding_window(self) -> Optional[int]:
        return getattr(self.hf_config, "sliding_window", None)

    def get_vocab_size(self) -> int:
        return self.hf_config.vocab_size

    def get_hidden_size(self) -> int:
        return self.hf_config.hidden_size

    def get_head_size(self) -> int:
        # FIXME(woosuk): This may not be true for all models.
        return self.hf_config.hidden_size // self.hf_config.num_attention_heads

    def get_total_num_kv_heads(self) -> int:
        """Returns the total number of KV heads."""
        # For GPTBigCode & Falcon:
        # NOTE: for falcon, when new_decoder_architecture is True, the
        # multi_query flag is ignored and we use n_head_kv for the number of
        # KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (self.hf_config.model_type in falcon_model_types and
                                   getattr(self.hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(self.hf_config, "multi_query", False):
            # Multi-query attention, only one KV head.
            # Currently, tensor parallelism is not supported in this case.
            return 1

        attributes = [
            # For Falcon:
            "n_head_kv",
            "num_kv_heads",
            # For LLaMA-2:
            "num_key_value_heads",
            # For ChatGLM:
            "multi_query_group_num",
        ]
        for attr in attributes:
            num_kv_heads = getattr(self.hf_config, attr, None)
            if num_kv_heads is not None:
                return num_kv_heads

        # For non-grouped-query attention models, the number of KV heads is
        # equal to the number of attention heads.
        return self.hf_config.num_attention_heads

    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
        """Returns the number of KV heads per GPU."""
        total_num_kv_heads = self.get_total_num_kv_heads()
        # If tensor parallelism is used, we divide the number of KV heads by
        # the tensor parallel size. We will replicate the KV heads in the
        # case where the number of KV heads is smaller than the tensor
        # parallel size so each GPU has at least one KV head.
        return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)

    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        total_num_hidden_layers = self.hf_config.num_hidden_layers
        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
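
# Illustrative sketch (not part of the original file): constructing a ModelConfig
# directly from a HuggingFace config. The checkpoint name is a placeholder; dtype
# resolution and max_model_len derivation follow _get_and_verify_dtype /
# _get_and_verify_max_len defined later in this module.
# >>> from transformers import AutoConfig
# >>> hf_config = AutoConfig.from_pretrained("facebook/opt-125m")
# >>> model_config = ModelConfig(hf_config, dtype="auto", seed=0)
# >>> model_config.dtype           # "auto" resolves to a half-precision torch dtype
# >>> model_config.max_model_len   # derived from max_position_embeddings in the HF config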


class CacheConfig:
    """Configuration for the KV cache.

    Args:
        block_size: Size of a cache block in number of tokens.
        gpu_memory_utilization: Fraction of GPU memory to use for the
            vLLM execution.
        swap_space: Size of the CPU swap space per GPU (in GiB).
        cache_dtype: Data type for the KV cache storage.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        swap_space: int,
        cache_dtype: str,
        sliding_window: Optional[int] = None,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.swap_space_bytes = swap_space * _GB
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
        self._verify_args()
        self._verify_cache_dtype()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def _verify_args(self) -> None:
        if self.gpu_memory_utilization > 1.0:
            raise ValueError("GPU memory utilization must be less than 1.0. Got "
                             f"{self.gpu_memory_utilization}.")

    def _verify_cache_dtype(self) -> None:
        if self.cache_dtype == "auto":
            pass
        elif self.cache_dtype == "fp8_e5m2":
            nvcc_cuda_version = get_nvcc_cuda_version()
            if nvcc_cuda_version < Version("11.8"):
                raise ValueError("FP8 is not supported when cuda version is lower than 11.8.")
            device_name = torch.cuda.get_device_name()
            if "AMD" in device_name:
                raise NotImplementedError("FP8_E5M2 KV Cache on AMD GPU has not been supported yet.")
            logger.info("Using fp8_e5m2 data type to store kv cache. It reduces "
                        "the GPU memory footprint and boosts the performance. "
                        "But it may cause slight accuracy drop. "
                        "Currently we only support fp8 without scaling factors and "
                        "make e5m2 as a default format.")
        else:
            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_cpu_memory = get_cpu_memory()
        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. " + msg)
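
# Illustrative sketch (not part of the original file): the swap space argument is
# given in GiB and converted to bytes with the module-level _GB constant.
# >>> cache_config = CacheConfig(block_size=16, gpu_memory_utilization=0.9,
# ...                            swap_space=4, cache_dtype="auto")
# >>> cache_config.swap_space_bytes   # 4 * _GB
# >>> cache_config.num_gpu_blocks     # None until profiling sets it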


class ParallelConfig:
    """Configuration for the distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Whether to use Ray for model workers. Will be set to
            True if either pipeline_parallel_size or tensor_parallel_size is
            greater than 1.
        max_parallel_loading_workers: Maximum number of parallel batches when
            loading the model sequentially, to avoid RAM OOM with tensor
            parallelism and large models.
        disable_custom_all_reduce: Disable the custom all-reduce kernel and
            fall back to NCCL.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: bool,
        max_parallel_loading_workers: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.worker_use_ray = worker_use_ray
        self.max_parallel_loading_workers = max_parallel_loading_workers
        self.disable_custom_all_reduce = disable_custom_all_reduce

        self.world_size = pipeline_parallel_size * tensor_parallel_size
        if self.world_size > 1:
            self.worker_use_ray = True
        self._verify_args()

    def _verify_args(self) -> None:
        if self.pipeline_parallel_size > 1:
            raise NotImplementedError("Pipeline parallelism is not supported yet.")
        if not self.disable_custom_all_reduce and self.world_size > 1:
            if is_hip():
                self.disable_custom_all_reduce = True
                logger.info("Disabled the custom all-reduce kernel because it is not "
                            "supported on AMD GPUs.")
            elif self.pipeline_parallel_size > 1:
                self.disable_custom_all_reduce = True
                logger.info("Disabled the custom all-reduce kernel because it is not "
                            "supported with pipeline parallelism.")
        # FIXME(woosuk): Fix the stability issues and re-enable the custom
        # all-reduce kernel.
        if not self.disable_custom_all_reduce and self.world_size > 1:
            self.disable_custom_all_reduce = True
            logger.info("Custom all-reduce kernels are temporarily disabled due to "
                        "stability issues. We will re-enable them once the issues are "
                        "resolved.")
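
# Illustrative sketch (not part of the original file): any world_size > 1 forces
# Ray workers on, and the temporary guard above disables custom all-reduce.
# >>> parallel_config = ParallelConfig(pipeline_parallel_size=1,
# ...                                  tensor_parallel_size=2,
# ...                                  worker_use_ray=False)
# >>> parallel_config.world_size                  # 2
# >>> parallel_config.worker_use_ray              # True (forced)
# >>> parallel_config.disable_custom_all_reduce   # True (temporarily forced)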


class SchedulerConfig:
    """Scheduler configuration.

    Args:
        max_num_batched_tokens: Maximum number of tokens to be processed in
            a single iteration.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        max_paddings: Maximum number of paddings to be added to a batch.
    """

    def __init__(
        self,
        max_num_batched_tokens: Optional[int],
        max_num_seqs: int,
        max_model_len: int,
        max_paddings: int,
    ) -> None:
        if max_num_batched_tokens is not None:
            self.max_num_batched_tokens = max_num_batched_tokens
        else:
            # If max_model_len is too short, use 2048 as the default value for
            # higher throughput.
            self.max_num_batched_tokens = max(max_model_len, 2048)
        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.max_paddings = max_paddings
        self._verify_args()

    def _verify_args(self) -> None:
        if self.max_num_batched_tokens < self.max_model_len:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")
        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")
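
# Illustrative sketch (not part of the original file): when max_num_batched_tokens
# is None, the scheduler falls back to max(max_model_len, 2048).
# >>> scheduler_config = SchedulerConfig(max_num_batched_tokens=None, max_num_seqs=256,
# ...                                    max_model_len=1024, max_paddings=256)
# >>> scheduler_config.max_num_batched_tokens   # 2048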


class DeviceConfig:

    def __init__(self, device: str = "cuda") -> None:
        self.device = torch.device(device)


@dataclass
class LoRAConfig:
    max_lora_rank: int
    max_loras: int
    max_cpu_loras: Optional[int] = None
    lora_dtype: Optional[torch.dtype] = None
    lora_extra_vocab_size: int = 256
    # This is a constant.
    lora_vocab_padding_size: ClassVar[int] = 256

    def __post_init__(self):
        # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
        possible_max_ranks = (8, 16, 32, 64)
        possible_lora_extra_vocab_size = (0, 256, 512)
        if self.max_lora_rank not in possible_max_ranks:
            raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be one of "
                             f"{possible_max_ranks}.")
        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
            raise ValueError(f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
                             f"must be one of {possible_lora_extra_vocab_size}.")
        if self.max_loras < 1:
            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
        elif self.max_cpu_loras < self.max_loras:
            raise ValueError(f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                             f"max_loras ({self.max_loras})")

    def verify_with_model_config(self, model_config: ModelConfig):
        if self.lora_dtype in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            self.lora_dtype = getattr(torch, self.lora_dtype)
        if model_config.quantization is not None:
            raise ValueError("LoRA is not supported with quantized models yet.")

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        if scheduler_config.max_num_batched_tokens > 65528:
            raise ValueError(
                "Due to limitations of the custom LoRA CUDA kernel, "
                "max_num_batched_tokens must be <= 65528 when "
                "LoRA is enabled.")
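
# Illustrative sketch (not part of the original file): __post_init__ validates the
# rank and fills max_cpu_loras from max_loras when it is left unset.
# >>> lora_config = LoRAConfig(max_lora_rank=32, max_loras=4)
# >>> lora_config.max_cpu_loras                   # 4
# >>> LoRAConfig(max_lora_rank=33, max_loras=4)   # raises ValueError (rank not in (8, 16, 32, 64))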


_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]


def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: Union[str, torch.dtype],
) -> torch.dtype:
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    if isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype == "auto":
            if config_dtype == torch.float32:
                # Following the common practice, we use float16 for float32
                # models.
                torch_dtype = torch.float16
            else:
                torch_dtype = config_dtype
        else:
            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                raise ValueError(f"Unknown dtype: {dtype}")
            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
    elif isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

    if is_hip() and torch_dtype == torch.float32:
        rocm_supported_dtypes = [
            k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
        ]
        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
                         f"Supported dtypes are {rocm_supported_dtypes}")

    # Verify the dtype.
    if torch_dtype != config_dtype:
        if torch_dtype == torch.float32:
            # Upcasting to float32 is allowed.
            pass
        elif config_dtype == torch.float32:
            # Downcasting from float32 to float16 or bfloat16 is allowed.
            pass
        else:
            # Casting between float16 and bfloat16 is allowed with a warning.
            logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
    return torch_dtype
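
# Illustrative sketch (not part of the original file): "auto" downcasts fp32 HF
# configs to float16, while explicit strings are looked up in the table above.
# >>> cfg = PretrainedConfig()
# >>> cfg.torch_dtype = torch.float32
# >>> _get_and_verify_dtype(cfg, "auto")        # torch.float16
# >>> _get_and_verify_dtype(cfg, "bfloat16")    # torch.bfloat16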


def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
) -> int:
    """Get and verify the model's maximum length."""
    derived_max_model_len = float("inf")
    possible_keys = [
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # ChatGLM2
        "seq_length",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]
    for key in possible_keys:
        max_len_key = getattr(hf_config, key, None)
        if max_len_key is not None:
            derived_max_model_len = min(derived_max_model_len, max_len_key)
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        assert "factor" in rope_scaling
        scaling_factor = rope_scaling["factor"]
        if rope_scaling["type"] == "yarn":
            derived_max_model_len = rope_scaling["original_max_position_embeddings"]
        derived_max_model_len *= scaling_factor

    if max_model_len is None:
        max_model_len = derived_max_model_len
    elif max_model_len > derived_max_model_len:
        raise ValueError(
            f"User-specified max_model_len ({max_model_len}) is greater than "
            f"the derived max_model_len ({max_len_key}={derived_max_model_len}"
            " in model's config.json). This may lead to incorrect model "
            "outputs or CUDA errors. Make sure the value is correct and "
            "within the model context size.")
    return int(max_model_len)
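
# Illustrative sketch (not part of the original file): rope_scaling multiplies the
# derived length, and a user value above the derived one is rejected.
# >>> cfg = PretrainedConfig()
# >>> cfg.max_position_embeddings = 4096
# >>> cfg.rope_scaling = {"type": "linear", "factor": 2.0}
# >>> _get_and_verify_max_len(cfg, None)       # 8192
# >>> _get_and_verify_max_len(cfg, 16384)      # raises ValueError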
verl/third_party/vllm/vllm_v_0_3_1/llm.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
from typing import Dict, List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import PretrainedConfig
import torch.nn as nn

from .arg_utils import EngineArgs
from .llm_engine_sp import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.utils import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer


class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.
This class includes a tokenizer, a language model (possibly distributed
across multiple GPUs), and GPU memory space allocated for intermediate
states (aka KV cache). Given a batch of prompts and sampling parameters,
this class generates texts from the model, using an intelligent batching
mechanism and efficient memory management.
NOTE: This class is intended to be used for offline inference. For online
serving, use the `AsyncLLMEngine` class instead.
NOTE: For the comprehensive list of arguments, see `EngineArgs`.
Args:
model: A HuggingFace Transformers model instance.
tokenizer: A HuggingFace Transformers tokenizer instance.
tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
if available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
tensor_parallel_size: The number of GPUs to use for distributed
execution with tensor parallelism.
dtype: The data type for the model weights and activations. Currently,
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
the `torch_dtype` attribute specified in the model config file.
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
we support "awq". If None, we assume the model weights are not
quantized and use `dtype` to determine the data type of the weights.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id.
seed: The seed to initialize the random number generator for sampling.
gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
reserve for the model weights, activations, and KV cache. Higher
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Otherwise, too small values may cause out-of-memory (OOM) errors.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
disable_custom_all_reduce: See ParallelConfig
"""
    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
        model_hf_config: PretrainedConfig,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model_hf_config=model_hf_config,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            **kwargs,
        )
        tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
        if not isinstance(tokenizer, tokenizer_cls):
            raise ValueError(
                f"Unexpected tokenizer type: {type(tokenizer)}. Must be "
                "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, "
                "verl.workers.rollout.HybridEngineBaseTokenizer")
        self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
        self.request_counter = Counter()

    def init_cache_engine(self):
        self.llm_engine.init_cache_engine()

    def free_cache_engine(self):
        self.llm_engine.free_cache_engine()

    def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[SamplingParams] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        prefix_pos: Optional[Union[int, List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            use_tqdm: Whether to use tqdm to display the progress bar.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be provided.")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if prompts is not None and prompt_token_ids is not None:
            if len(prompts) != len(prompt_token_ids):
                raise ValueError("The lengths of prompts and prompt_token_ids must be the same.")
        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        # Add requests to the engine.
        num_requests = len(prompts) if prompts is not None else len(prompt_token_ids)
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
            if not isinstance(token_ids, list):
                # NOTE(shengguangming): convert the rollout input into List[int]
                token_ids = self._pre_process_inputs(token_ids)
            self._add_request(prompt, sampling_params, token_ids, lora_request=lora_request, prefix_pos=prefix_pos_i)
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        prefix_pos: Optional[int] = None,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    prefix_pos=prefix_pos)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests, desc="Processed prompts")
        # Run the engine.
        outputs: List[RequestOutput] = []
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        # TODO(shengguangming): maybe we can hack the autoregressive logics without only apply post process for better performance
        return self._post_process_outputs(outputs)

    # NOTE(shengguangming): add for verl
    # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
    def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
        # remove the left padding in the prompt token_ids
        pad_token_id = (self.llm_engine.tokenizer.pad_token_id
                        if self.llm_engine.tokenizer.pad_token_id is not None else
                        self.llm_engine.tokenizer.eos_token_id)
        non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
        token_ids = prompt_token_ids[non_pad_index:].tolist()
        return token_ids

    # NOTE(shengguangming): add for verl
    def _post_process_outputs(self, outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
        output_token_ids = []
        logprobs = []
        for output in outputs:  # List[RequestOutput]
            output = output.outputs
            for output in output:  # List[CompletionOutput], usually len == 1
                output_token_ids.append(torch.tensor(output.token_ids))
                # TODO(shengguangming): can be optimized by rewriting the Sampler._get_logprobs() logits
                logprobs_dicts = output.logprobs
                if logprobs_dicts is not None:
                    logprob = []
                    for logprobs_dict, id in zip(logprobs_dicts, output.token_ids):
                        logprob.append(logprobs_dict[id])
                    logprobs.append(torch.tensor(logprob))
        pad_token_id = (self.llm_engine.tokenizer.pad_token_id
                        if self.llm_engine.tokenizer.pad_token_id is not None else
                        self.llm_engine.tokenizer.eos_token_id)
        output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
        if len(logprobs) > 0:
            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
        return output_token_ids, logprobs

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
        self.llm_engine.sync_model_weights(actor_weights=actor_weights)

    def offload_model_weights(self) -> None:
        self.llm_engine.offload_model_weights()
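
# Illustrative sketch (not part of the original file): typical offline use inside a
# verl rollout worker. `actor_module`, `hf_tokenizer`, `hf_config` and `batch_ids`
# are placeholders for objects created elsewhere. Note that this adapted generate()
# returns padded (token_ids, logprobs) tensors via _post_process_outputs rather than
# the RequestOutput list of upstream vLLM.
# >>> llm = LLM(actor_module, hf_tokenizer, hf_config, tensor_parallel_size=1, dtype="bfloat16")
# >>> llm.init_cache_engine()
# >>> token_ids, logprobs = llm.generate(prompt_token_ids=batch_ids,
# ...                                    sampling_params=SamplingParams(temperature=1.0))
# >>> llm.free_cache_engine()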
verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
import os
import socket
import time
import torch
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

from vllm.lora.request import LoRARequest
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceGroupOutput,
                           SequenceOutput, SequenceStatus)
from vllm.transformers_utils.tokenizer import detokenize_incrementally
from vllm.engine.metrics import StatLogger, Stats
from vllm.utils import Counter
import torch.nn as nn

from .arg_utils import EngineArgs
from .tokenizer import TokenizerGroup

logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5


class LLMEngine:
"""An LLM engine that receives requests and generates texts.
This is the main class for the vLLM engine. It receives requests
from clients and generates texts from the LLM. It includes a tokenizer, a
language model (possibly distributed across multiple GPUs), and GPU memory
space allocated for intermediate states (aka KV cache). This class utilizes
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The `LLM` class wraps this class for offline batched inference and the
`AsyncLLMEngine` class wraps this class for online serving.
NOTE: The config arguments are derived from the `EngineArgs` class. For the
comprehensive list of arguments, see `EngineArgs`.
Args:
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
distributed_init_method: The initialization method for distributed
execution. See `torch.distributed.init_process_group` for details.
placement_group: Ray placement group for distributed execution.
Required for distributed execution.
log_stats: Whether to log statistics.
"""
    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: nn.Module,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        lora_config: Optional[LoRAConfig],
        distributed_init_method: str,
        placement_group: Optional[None],
        log_stats: bool,
    ) -> None:
        logger.info("Initializing an LLM engine with config: "
                    f"model={model_config.model!r}, "
                    f"tokenizer={model_config.tokenizer!r}, "
                    # f"tokenizer_mode={model_config.tokenizer_mode}, "
                    f"revision={model_config.revision}, "
                    f"tokenizer_revision={model_config.tokenizer_revision}, "
                    # f"trust_remote_code={model_config.trust_remote_code}, "
                    f"dtype={model_config.dtype}, "
                    f"max_seq_len={model_config.max_model_len}, "
                    # f"download_dir={model_config.download_dir!r}, "
                    # f"load_format={model_config.load_format}, "
                    f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, "
                    f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
                    f"quantization={model_config.quantization}, "
                    f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config  # TODO: currently is hf_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        assert self.cache_config.sliding_window == getattr(self.model_config.hf_config, "sliding_window", None)
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.log_stats = log_stats
        self._verify_args()

        # self.model = model  # should not store the model, it should be deleted
        # TODO(shengguangming): maybe we can choose init here or from arguments
        self._init_tokenizer(tokenizer)

        self.seq_counter = Counter()

        # Create the parallel GPU workers.
        self._init_workers_sp(model, distributed_init_method)

        # Profile the memory usage and initialize the cache.
        self._init_cache_sp()

        # Create the scheduler.
        # NOTE(shengguangming): each process will have independent scheduler
        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)

        # Metric Logging.
        if self.log_stats:
            self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC)

        # Logging.
        self.last_logging_time = 0.0
        # List of (timestamp, num_tokens)
        self.num_prompt_tokens: List[Tuple[float, int]] = []
        # List of (timestamp, num_tokens)
        self.num_generation_tokens: List[Tuple[float, int]] = []

    def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
        init_kwargs = dict(enable_lora=bool(self.lora_config),
                           max_num_seqs=self.scheduler_config.max_num_seqs,
                           max_input_length=None)
        init_kwargs.update(tokenizer_init_kwargs)
        self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)

    # TODO: check get_lora_tokenizer func
    def get_tokenizer_for_seq(self, sequence: Sequence):
        return self.tokenizer.get_lora_tokenizer(sequence.lora_request)

    def _init_workers_sp(self, model, distributed_init_method: str):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
        from .worker import Worker  # pylint: disable=import-outside-toplevel

        rank = int(os.getenv("RANK"))
        self.worker = Worker(
            model,
            self.model_config,
            self.parallel_config,
            self.scheduler_config,
            self.device_config,
            rank,
            distributed_init_method,
            lora_config=self.lora_config,
            kv_cache_dtype=self.cache_config.cache_dtype,
        )
        # NOTE(shengguangming): torch.distributed.init_process_group will be called inside the init_model()
        self.worker.init_model()
        self.worker.load_model()

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)

    def _init_cache_sp(self) -> None:
        """Profiles the memory usage and initializes the KV cache."""
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self.worker.profile_num_available_blocks(
            block_size=self.cache_config.block_size,
            gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
            cpu_swap_space=self.cache_config.swap_space_bytes,
            cache_dtype=self.cache_config.cache_dtype,
        )

        # NOTE(shengguangming): Now we don't use a shared centralized controller but each process will
        # have its own scheduler
        num_gpu_blocks = num_blocks[0]
        num_cpu_blocks = num_blocks[1]

        # FIXME(woosuk): Change to debug log.
        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
                    f"# CPU blocks: {num_cpu_blocks}")

        if num_gpu_blocks <= 0:
            raise ValueError("No available memory for the cache blocks. "
                             "Try increasing `gpu_memory_utilization` when "
                             "initializing the engine.")
        max_seq_len = self.cache_config.block_size * num_gpu_blocks
        if self.model_config.max_model_len > max_seq_len:
            raise ValueError(f"The model's max seq len ({self.model_config.max_model_len}) "
                             "is larger than the maximum number of tokens that can be "
                             f"stored in KV cache ({max_seq_len}). Try increasing "
                             "`gpu_memory_utilization` or decreasing `max_model_len` when "
                             "initializing the engine.")

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        # Initialize the cache.
        self.worker.init_cache_engine(cache_config=self.cache_config)
        self.worker.warm_up_model()

    def init_cache_engine(self):
        self.worker.init_cache_engine(cache_config=self.cache_config)

    def free_cache_engine(self):
        self.worker.free_cache_engine()
    @classmethod
    def from_engine_args(cls, model, tokenizer, engine_args: EngineArgs) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
        parallel_config = engine_configs[2]
        # Initialize the cluster.
        distributed_init_method, placement_group = initialize_cluster(parallel_config)
        # Create the LLM engine.
        engine = cls(model,
                     tokenizer,
                     *engine_configs,
                     distributed_init_method,
                     placement_group,
                     log_stats=not engine_args.disable_log_stats)
        return engine
    def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        prefix_pos: Optional[int] = None,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters for text generation.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
            prefix_pos: If not None, we use the given position as the prefix
                position for each prompt. We will cache the prefix's KV
                cache and reuse it for the next request with the same prefix.
                This is an experimental feature, and may be replaced with
                automatic prefix caching in the future.

        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
            - Create `best_of` number of :class:`~vllm.Sequence` objects.
            - Create a :class:`~vllm.SequenceGroup` object
              from the list of :class:`~vllm.Sequence`.
            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.

        Example:
            >>> # initialize engine
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> # set request arguments
            >>> example_prompt = "Who is the president of the United States?"
            >>> sampling_params = SamplingParams(temperature=0.0)
            >>> request_id = 0
            >>>
            >>> # add the request to the engine
            >>> engine.add_request(
            >>>    str(request_id),
            >>>    example_prompt,
            >>>    SamplingParams(temperature=0.0))
            >>> # continue the request processing
            >>> ...
        """
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        if arrival_time is None:
            arrival_time = time.monotonic()
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(prompt)

        # Create the sequences.
        block_size = self.cache_config.block_size
        seq_id = next(self.seq_counter)
        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request)

        # Check whether the input specifies prefix
        prefix = self.scheduler.prefix_pool.add_or_get_prefix(
            prompt_token_ids[:prefix_pos],
            lora_request.lora_int_id if lora_request else 0) if prefix_pos is not None else None

        # Create the sequence group.
        seq_group = SequenceGroup(request_id, [seq], sampling_params, arrival_time, lora_request, prefix)

        # Add the sequence group to the scheduler.
        self.scheduler.add_seq_group(seq_group)
    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a request(s) with the given ID.

        Args:
            request_id: The ID(s) of the request to abort.

        Details:
            - Refer to the
              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
              from class :class:`~vllm.core.scheduler.Scheduler`.

        Example:
            >>> # initialize engine and add a request with request_id
            >>> request_id = str(0)
            >>> # abort the request
            >>> engine.abort_request(request_id)
        """
        self.scheduler.abort_seq_group(request_id)

    def get_model_config(self) -> ModelConfig:
        """Gets the model configuration."""
        return self.model_config

    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return self.scheduler.get_num_unfinished_seq_groups()

    def has_unfinished_requests(self) -> bool:
        """Returns True if there are unfinished requests."""
        return self.scheduler.has_unfinished_seqs()
    def _check_beam_search_early_stopping(
        self,
        early_stopping: Union[bool, str],
        sampling_params: SamplingParams,
        best_running_seq: Sequence,
        current_worst_seq: Sequence,
    ) -> bool:
        assert sampling_params.use_beam_search
        length_penalty = sampling_params.length_penalty
        if early_stopping is True:
            return True

        current_worst_score = (current_worst_seq.get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=self.get_tokenizer_for_seq(current_worst_seq).eos_token_id))
        if early_stopping is False:
            highest_attainable_score = (best_running_seq.get_beam_search_score(
                length_penalty=length_penalty,
                eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
        else:
            assert early_stopping == "never"
            if length_penalty > 0.0:
                # If length_penalty > 0.0, beam search will prefer longer
                # sequences. The highest attainable score calculation is
                # based on the longest possible sequence length in this case.
                max_possible_length = max(best_running_seq.get_prompt_len() + sampling_params.max_tokens,
                                          self.scheduler_config.max_model_len)
                highest_attainable_score = (best_running_seq.get_beam_search_score(
                    length_penalty=length_penalty,
                    eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id,
                    seq_len=max_possible_length))
            else:
                # Otherwise, beam search will prefer shorter sequences. The
                # highest attainable score calculation is based on the current
                # sequence length.
                highest_attainable_score = (best_running_seq.get_beam_search_score(
                    length_penalty=length_penalty,
                    eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
        # Stop when no running sequence can beat the current worst finished one.
        return current_worst_score >= highest_attainable_score
    def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None:
        # Process prompt logprobs
        prompt_logprobs = outputs.prompt_logprobs
        if prompt_logprobs is not None:
            seq_group.prompt_logprobs = prompt_logprobs

        # Process samples
        samples = outputs.samples
        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        existing_finished_seqs = seq_group.get_finished_seqs()
        parent_child_dict = {parent_seq.seq_id: [] for parent_seq in parent_seqs}
        for sample in samples:
            parent_child_dict[sample.parent_seq_id].append(sample)
        # List of (child, parent)
        child_seqs: List[Tuple[Sequence, Sequence]] = []

        # Process the child samples for each parent sequence
        for parent in parent_seqs:
            child_samples: List[SequenceOutput] = parent_child_dict[parent.seq_id]
            if len(child_samples) == 0:
                # This parent sequence has no children samples. Remove
                # the parent sequence from the sequence group since it will
                # not be used in the future iterations.
                parent.status = SequenceStatus.FINISHED_ABORTED
                seq_group.remove(parent.seq_id)
                self.scheduler.free_seq(parent)
                continue
            # Fork the parent sequence if there are multiple child samples.
            for child_sample in child_samples[:-1]:
                new_child_seq_id = next(self.seq_counter)
                child = parent.fork(new_child_seq_id)
                child.append_token_id(child_sample.output_token, child_sample.logprobs)
                child_seqs.append((child, parent))
            # Continue the parent sequence for the last child sample.
            # We reuse the parent sequence here to reduce redundant memory
            # copies, especially when using non-beam search sampling methods.
            last_child_sample = child_samples[-1]
            parent.append_token_id(last_child_sample.output_token, last_child_sample.logprobs)
            child_seqs.append((parent, parent))

        for seq, _ in child_seqs:
            # self._decode_sequence(seq, seq_group.sampling_params)
            self._check_stop(seq, seq_group.sampling_params)

        # Non-beam search case
        if not seq_group.sampling_params.use_beam_search:
            # For newly created child sequences, add them to the sequence group
            # and fork them in block manager if they are not finished.
            for seq, parent in child_seqs:
                if seq is not parent:
                    seq_group.add(seq)
                    if not seq.is_finished():
                        self.scheduler.fork_seq(parent, seq)

            # Free the finished and selected parent sequences' memory in block
            # manager. Keep them in the sequence group as candidate output.
            # NOTE: we need to fork the new sequences before freeing the
            # old sequences.
            for seq, parent in child_seqs:
                if seq is parent and seq.is_finished():
                    self.scheduler.free_seq(seq)
            return

        # Beam search case
        # Select the child sequences to keep in the sequence group.
        selected_child_seqs = []
        unselected_child_seqs = []
        beam_width = seq_group.sampling_params.best_of
        length_penalty = seq_group.sampling_params.length_penalty

        # Select the newly finished sequences with the highest scores
        # to replace existing finished sequences.
        # Tuple of (seq, parent, is_new)
        existing_finished_seqs = [(seq, None, False) for seq in existing_finished_seqs]
        new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs if seq.is_finished()]
        all_finished_seqs = existing_finished_seqs + new_finished_seqs
        # Sort the finished sequences by their scores.
        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
                               reverse=True)
        for seq, parent, is_new in all_finished_seqs[:beam_width]:
            if is_new:
                # A newly generated child sequence finishes and has a high
                # score, so we will add it into the sequence group.
                selected_child_seqs.append((seq, parent))
        for seq, parent, is_new in all_finished_seqs[beam_width:]:
            if is_new:
                # A newly generated child sequence finishes but has a low
                # score, so we will not add it into the sequence group.
                # Additionally, if this sequence is a continuation of a
                # parent sequence, we will need remove the parent sequence
                # from the sequence group.
                unselected_child_seqs.append((seq, parent))
            else:
                # An existing finished sequence has a low score, so we will
                # remove it from the sequence group.
                seq_group.remove(seq.seq_id)

        # select the top beam_width sequences from the running
        # sequences for the next iteration to continue the beam
        # search.
        running_child_seqs = [(seq, parent) for seq, parent in child_seqs if not seq.is_finished()]
        # Sort the running sequences by their scores.
        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
                                reverse=True)

        # Check if we can stop the beam search.
        if len(running_child_seqs) == 0:
            # No running sequences, stop the beam search.
            stop_beam_search = True
        elif len(all_finished_seqs) < beam_width:
            # Not enough finished sequences, continue the beam search.
            stop_beam_search = False
        else:
            # Check the early stopping criteria
            best_running_seq = running_child_seqs[0][0]
            current_worst_seq = all_finished_seqs[beam_width - 1][0]
            stop_beam_search = self._check_beam_search_early_stopping(
                seq_group.sampling_params.early_stopping, seq_group.sampling_params, best_running_seq,
                current_worst_seq)

        if stop_beam_search:
            # Stop the beam search and remove all the running sequences from
            # the sequence group.
            unselected_child_seqs.extend(running_child_seqs)
        else:
            # Continue the beam search and select the top beam_width sequences
            # to continue the beam search.
            selected_child_seqs.extend(running_child_seqs[:beam_width])
            # The remaining running sequences will not be used in the next
            # iteration. Again, if these sequences are continuations of
            # parent sequences, we will need to remove the parent sequences
            # from the sequence group.
            unselected_child_seqs.extend(running_child_seqs[beam_width:])

        # For newly created child sequences, add them to the sequence group
        # and fork them in block manager if they are not finished.
        for seq, parent in selected_child_seqs:
            if seq is not parent:
                seq_group.add(seq)
                if not seq.is_finished():
                    self.scheduler.fork_seq(parent, seq)

        # Free the finished and selected parent sequences' memory in block
        # manager. Keep them in the sequence group as candidate output.
        for seq, parent in selected_child_seqs:
            if seq is parent and seq.is_finished():
                self.scheduler.free_seq(seq)

        # Remove the unselected parent sequences from the sequence group and
        # free their memory in block manager.
        for seq, parent in unselected_child_seqs:
            if seq is parent:
                # Remove the parent sequence if it is not selected for next
                # iteration
                seq_group.remove(seq.seq_id)
                self.scheduler.free_seq(seq)
    def _process_model_outputs(self, output: SamplerOutput,
                               scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
        # Update the scheduled sequence groups with the model outputs.
        scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
        for seq_group, outputs in zip(scheduled_seq_groups, output):
            self._process_sequence_group_outputs(seq_group, outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
        for seq_group in scheduled_seq_groups:
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
        for seq_group in scheduler_outputs.ignored_seq_groups:
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)

        # Update prefix state, now all the uncomputed prefixes are computed.
        for seq_group in scheduled_seq_groups:
            if (seq_group.prefix is not None and seq_group.prefix.allocated and not seq_group.prefix.computed):
                seq_group.prefix.computed = True

        # Log stats.
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs))

        return request_outputs
    def step(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.

        This function performs one decoding iteration of the engine. It first
        schedules the sequences to be executed in the next iteration and the
        token blocks to be swapped in/out/copied. Then, it executes the model
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()

        if not scheduler_outputs.is_empty():
            output = self.worker.execute_model(
                seq_group_metadata_list=seq_group_metadata_list,  # TODO: check this input
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
            )
        else:
            return [RequestOutput.from_seq_group(seq_group) for seq_group in scheduler_outputs.ignored_seq_groups]

        return self._process_model_outputs(output, scheduler_outputs)
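
    # Illustrative sketch (not part of the original file): the surrounding LLM wrapper
    # drives step() in a loop along these lines; `collect` is a placeholder.
    # >>> while engine.has_unfinished_requests():
    # ...     for out in engine.step():
    # ...         if out.finished:
    # ...             collect(out)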
    def do_log_stats(self) -> None:
        """Forced log when no requests active."""
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs=None))

    def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
        """Get Stats to be Logged to Prometheus."""
        now = time.monotonic()

        # KV Cache Usage in %.
        num_total_gpu = self.cache_config.num_gpu_blocks
        num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
        gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)

        num_total_cpu = self.cache_config.num_cpu_blocks
        cpu_cache_usage = 0.
        if num_total_cpu > 0:
            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks()
            cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)

        # Scheduler State
        num_running = len(self.scheduler.running)
        num_swapped = len(self.scheduler.swapped)
        num_waiting = len(self.scheduler.waiting)

        # Iteration stats if we have scheduler output.
        num_prompt_tokens = 0
        num_generation_tokens = 0
        time_to_first_tokens = []
        time_per_output_tokens = []
        time_e2e_requests = []
        if scheduler_outputs is not None:
            prompt_run = scheduler_outputs.prompt_run

            # Number of Tokens.
            if prompt_run:
                num_prompt_tokens = scheduler_outputs.num_batched_tokens
            else:
                num_generation_tokens = scheduler_outputs.num_batched_tokens

            # Latency Timings.
            time_last_iters = []
            for seq_group in scheduler_outputs.scheduled_seq_groups:
                # Time since last token. (n.b. updates seq_group.last_token_time)
                time_last_iters.append(seq_group.get_last_latency(now))
                # Time since arrival for all finished requests.
                if seq_group.is_finished():
                    time_e2e_requests.append(now - seq_group.arrival_time)

            time_to_first_tokens = time_last_iters if prompt_run else []
            time_per_output_tokens = [] if prompt_run else time_last_iters

        return Stats(
            now=now,
            num_running=num_running,
            num_swapped=num_swapped,
            num_waiting=num_waiting,
            gpu_cache_usage=gpu_cache_usage,
            cpu_cache_usage=cpu_cache_usage,
            num_prompt_tokens=num_prompt_tokens,
            num_generation_tokens=num_generation_tokens,
            time_to_first_tokens=time_to_first_tokens,
            time_per_output_tokens=time_per_output_tokens,
            time_e2e_requests=time_e2e_requests,
        )
    # TODO: we may not need to decode
    def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
        """Decodes the new token for a sequence."""
        (new_tokens, new_output_text, prefix_offset, read_offset) = detokenize_incrementally(
            self.get_tokenizer_for_seq(seq),
            all_input_ids=seq.get_token_ids(),
            prev_tokens=seq.tokens,
            prefix_offset=seq.prefix_offset,
            read_offset=seq.read_offset,
            skip_special_tokens=prms.skip_special_tokens,
            spaces_between_special_tokens=prms.spaces_between_special_tokens,
        )
        if seq.tokens is None:
            seq.tokens = new_tokens
        else:
            seq.tokens.extend(new_tokens)
        seq.prefix_offset = prefix_offset
        seq.read_offset = read_offset
        seq.output_text += new_output_text

    def _check_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None:
        """Stop the finished sequences."""
        # for stop_str in sampling_params.stop:
        #     if seq.output_text.endswith(stop_str):
        #         self._finalize_sequence(seq, sampling_params, stop_str)
        #         seq.status = SequenceStatus.FINISHED_STOPPED
        #         return
        # if seq.get_last_token_id() in sampling_params.stop_token_ids:
        #     stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(seq.get_last_token_id())
        #     self._finalize_sequence(seq, sampling_params, stop_str)
        #     seq.status = SequenceStatus.FINISHED_STOPPED
        #     return

        # Check if the sequence has reached max_model_len.
        if seq.get_len() > self.scheduler_config.max_model_len:
            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
            return

        # Check if the sequence has reached max_tokens.
        if seq.get_output_len() == sampling_params.max_tokens:
            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
            return

        # Check if the sequence has generated the EOS token.
        if ((not sampling_params.ignore_eos) and
                seq.get_last_token_id() == self.get_tokenizer_for_seq(seq).eos_token_id):
            seq.status = SequenceStatus.FINISHED_STOPPED
            return

    def _finalize_sequence(self, seq: Sequence, sampling_params: SamplingParams, stop_string: str) -> None:
        if not sampling_params.include_stop_str_in_output and stop_string:
            # Truncate the output text so that the stop string is
            # not included in the output.
            seq.output_text = seq.output_text[:-len(stop_string)]

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self.worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.worker.remove_lora(lora_id)

    def list_loras(self) -> List[int]:
        return self.worker.list_loras()

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
        self.worker.sync_model_weights(actor_weights=actor_weights)

    def offload_model_weights(self) -> None:
        self.worker.offload_model_weights()
def
initialize_cluster
(
parallel_config
:
ParallelConfig
,
engine_use_ray
:
bool
=
False
,
ray_address
:
Optional
[
str
]
=
None
,
)
->
Tuple
[
str
,
Optional
[
None
]]:
"""Initialize the distributed cluster probably with Ray.
Args:
parallel_config: The configurations for parallel execution.
engine_use_ray: Whether to use Ray for async engine.
ray_address: The address of the Ray cluster. If None, uses
the default Ray cluster address.
Returns:
A tuple of (`distributed_init_method`, `placement_group`). The
`distributed_init_method` is the address for initializing the
distributed backend. `placement_group` includes the specification
of the resources for each distributed worker.
"""
# Initialize cluster locally.
port
=
get_open_port
()
# We need to setup the distributed init method to make sure
# the distributed megatron code (e.g., get world size) works correctly.
distributed_init_method
=
f
"tcp://localhost:
{
port
}
"
return
distributed_init_method
,
None
def
get_open_port
():
with
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
as
s
:
s
.
bind
((
""
,
0
))
return
s
.
getsockname
()[
1
]
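Editor's note: a minimal, self-contained sketch of the port-picking trick used by get_open_port() above. The helper name below is illustrative only; it simply reproduces the bind-to-port-0 idiom that initialize_cluster() then embeds in a tcp:// init method.

import socket

def _get_open_port_sketch() -> int:
    # Bind to port 0 so the OS hands back an unused ephemeral port; that port
    # is then used in the tcp:// init method for torch.distributed.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]

if __name__ == "__main__":
    port = _get_open_port_sketch()
    print(f"tcp://localhost:{port}")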
verl/third_party/vllm/vllm_v_0_3_1/model_loader.py (new file, mode 100644)

# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
"""Utilities for selecting and loading models."""
import contextlib
from typing import Dict, Type, Union

import torch
import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel
from megatron.core.tensor_parallel.utils import VocabUtility

from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights)
from .config import ModelConfig
from vllm.config import DeviceConfig, LoRAConfig
from .weight_loaders import *
from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors
from vllm.sequence import SamplerOutput
from typing import Optional
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.sampler import (_prune_hidden_states, _apply_logits_processors,
                                                _apply_penalties, _apply_top_k_top_p, _apply_min_p,
                                                _apply_penalties, _sample, _get_logprobs,
                                                _build_sampler_output)


@contextlib.contextmanager
def _set_default_torch_dtype(dtype: torch.dtype):
    """Sets the default torch dtype to the given dtype."""
    old_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    yield
    torch.set_default_dtype(old_dtype)


def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
    architectures = getattr(config, "architectures", [])
    for arch in architectures:
        model_cls = ModelRegistry.load_model_cls(arch)
        if model_cls is not None:
            return model_cls
    raise ValueError(f"Model architectures {architectures} are not supported for now. "
                     f"Supported architectures: {ModelRegistry.get_supported_archs()}")


from vllm.model_executor.layers.linear import *
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.activation import ScaledActivation

__LAYER_WEIGHT_LOADER_REGISTRY__ = {
    ColumnParallelLinear: parallel_weight_loader,
    MergedColumnParallelLinear: parallel_weight_loader,
    QKVParallelLinear: parallel_weight_loader,
    RowParallelLinear: parallel_weight_loader,
    VocabParallelEmbedding: parallel_weight_loader,
    ParallelLMHead: parallel_weight_loader
    # "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
    # "default_weight_loader": default_weight_loader
}

# NOTE(gmsheng): change the weight_loader function at runtime
for layer_class, weight_loader in __LAYER_WEIGHT_LOADER_REGISTRY__.items():
    layer_class.weight_loader = weight_loader

__MODEL_WEIGHT_LOADER_REGISTRY__ = {
    'GPT2LMHeadModel': gpt2_weight_loader,
    'LlamaForCausalLM': llama_weight_loader,
    'LLaMAForCausalLM': llama_weight_loader,
    'MistralForCausalLM': mistral_weight_loader,
}

# FIXME(shengguangming): the vLLM vocab will pad to 64, which may incur out of bounds
# so we need to rewrite the init function of vocab
DEFAULT_VOCAB_PADDING_SIZE = 64


def vocab_init(self,
               num_embeddings: int,
               embedding_dim: int,
               params_dtype: Optional[torch.dtype] = None,
               org_num_embeddings: Optional[int] = None,
               padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
    super(VocabParallelEmbedding, self).__init__()

    # Keep the input dimensions.
    # TODO (pad to be divided by 4)
    self.num_embeddings = num_embeddings
    self.org_vocab_size = org_num_embeddings or num_embeddings

    # self.num_embeddings_padded = pad_vocab_size(num_embeddings,
    #                                             padding_size)
    self.embedding_dim = embedding_dim
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.tp_size = get_tensor_model_parallel_world_size()
    # Divide the weight matrix along the vocabulary dimension.
    self.vocab_start_index, self.vocab_end_index = (
        VocabUtility.vocab_range_from_global_vocab_size(self.num_embeddings,
                                                        get_tensor_model_parallel_rank(),
                                                        self.tp_size))
    self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
    self.weight = Parameter(
        torch.empty(
            self.num_embeddings_per_partition,
            self.embedding_dim,
            # device=torch.cuda.current_device(),
            dtype=params_dtype))
    set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})


VocabParallelEmbedding.__init__ = vocab_init
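Editor's note: the patched __init__ above keeps the un-padded vocab size and splits it evenly across tensor-parallel ranks. Below is a standalone sketch of just the range arithmetic; it mirrors what Megatron's VocabUtility.vocab_range_from_global_vocab_size computes (assuming an even split) but is an illustration, not the real Megatron helper.

def vocab_range(global_vocab_size, rank, tp_size):
    # Illustrative reimplementation; assumes global_vocab_size % tp_size == 0.
    per_partition = global_vocab_size // tp_size
    start = rank * per_partition
    return start, start + per_partition

# e.g. a 32000-token vocab on 4 TP ranks -> (0, 8000), (8000, 16000), (16000, 24000), (24000, 32000)
for r in range(4):
    print(r, vocab_range(32000, r, 4))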
def _get_model_weight_loader(arch: str):
    if arch in __MODEL_WEIGHT_LOADER_REGISTRY__:
        return __MODEL_WEIGHT_LOADER_REGISTRY__[arch]
    raise ValueError(f"Model architectures {arch} are not supported for now. "
                     f"Supported architectures: {ModelRegistry.get_supported_archs()}")


def get_model(actor_model: Union[PreTrainedModel, Dict],
              model_config: ModelConfig,
              device_config: DeviceConfig,
              lora_config: Optional[LoRAConfig] = None) -> nn.Module:
    model_class = _get_model_architecture(model_config.hf_config)

    # Get the quantization config.
    linear_method = None
    quant_config = None
    if model_config.quantization is not None:
        quant_config = get_quant_config(model_config.quantization, model_config.model,
                                        model_config.hf_config, model_config.download_dir)
        capability = torch.cuda.get_device_capability()
        capability = capability[0] * 10 + capability[1]
        if capability < quant_config.get_min_capability():
            raise ValueError(f"The quantization method {model_config.quantization} is not "
                             "supported for the current GPU. "
                             f"Minimum capability: {quant_config.get_min_capability()}. "
                             f"Current capability: {capability}.")
        supported_dtypes = quant_config.get_supported_act_dtypes()
        if model_config.dtype not in supported_dtypes:
            raise ValueError(f"{model_config.dtype} is not supported for quantization "
                             f"method {model_config.quantization}. Supported dtypes: "
                             f"{supported_dtypes}")
        linear_method = quant_config.get_linear_method()

    with _set_default_torch_dtype(model_config.dtype):
        # Create a model instance.
        # The weights will be initialized as empty tensors.
        # with torch.device(device_config.device):
        # NOTE(sgm): init the model in cpu
        model = model_class(model_config.hf_config, linear_method)

        if model_config.load_format == "dummy":
            model = model.cuda()
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
            initialize_dummy_weights(model)
        elif model_config.load_format == 'model' or model_config.load_format == 'auto':
            # NOTE(shengguangming) Load the weights from the actor model
            if isinstance(actor_model, nn.Module):
                load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                             vllm_model=model)
            else:
                load_weights(actor_weights=actor_model, vllm_model=model)

    # NOTE(sgm): some weights already point to the GPU, but we still need this call.
    model = model.cuda()

    # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
    return model.eval()


# the actor model is .state_dict()
def load_weights(actor_weights: Dict, vllm_model: nn.Module):
    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
    weight_loader(actor_weights, vllm_model)
    # NOTE(sgm): to reduce peak memory usage, we offload the vllm model to cpu
    # after init, so we need this after syncing the model weights in the first iter.
    vllm_model = vllm_model.cuda()


# FIXME(sgm): hack the Sampler function in vllm v0.3.1
# as they use ray, the sampler result only needs to return to the driver node,
# therefore gather is enough. However, we use SPMD instead of a central scheduler,
# so all_gather is required (aligned with v0.2.6)
def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
                embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
    # Get the logits for the next tokens.
    logits = torch.matmul(hidden_states, embedding.t())
    if embedding_bias is not None:
        logits += embedding_bias
    logits = tensor_model_parallel_all_gather(logits)
    # Remove paddings in vocab (if any).
    if logits is not None:
        logits = logits[:, :self.org_vocab_size]
    return logits


def forward(
    self,
    embedding: torch.Tensor,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
    embedding_bias: Optional[torch.Tensor] = None,
) -> Optional[SamplerOutput]:
    # Get the hidden states that we use for sampling.
    hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)

    # Get the logits for the next tokens.
    logits = self._get_logits(hidden_states, embedding, embedding_bias)

    # save origin logprobs for sampler_output
    origin_logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)

    # Only perform sampling in the driver worker.
    # Note: `_get_logits` is still distributed across TP workers because
    # the `embedding` weight is distributed across TP workers.
    # TODO(zhuohan): Change the get_logits part to a separate stage.
    if not sampling_metadata.perform_sampling:
        return None

    assert logits is not None
    _, vocab_size = logits.shape

    # Apply logits processors (if any).
    logits = _apply_logits_processors(logits, sampling_metadata)

    # Prepare sampling tensors with pinned memory to avoid blocking.
    (sampling_tensors, do_penalties, do_top_p_top_k,
     do_min_p) = SamplingTensors.from_sampling_metadata(sampling_metadata, vocab_size,
                                                        logits.device, logits.dtype)

    # Apply presence and frequency penalties.
    if do_penalties:
        logits = _apply_penalties(logits, sampling_tensors.prompt_tokens,
                                  sampling_tensors.output_tokens,
                                  sampling_tensors.presence_penalties,
                                  sampling_tensors.frequency_penalties,
                                  sampling_tensors.repetition_penalties)

    # Apply temperature scaling.
    # Use in-place division to avoid creating a new tensor.
    logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1))

    if do_top_p_top_k:
        logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks)

    if do_min_p:
        logits = _apply_min_p(logits, sampling_tensors.min_ps)

    # We use float32 for probabilities and log probabilities.
    # Compute the probabilities.
    probs = torch.softmax(logits, dim=-1, dtype=torch.float)
    # Compute the log probabilities.
    # Use log_softmax to ensure numerical stability.
    logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)

    # Sample the next tokens.
    sample_results = _sample(probs, logprobs, sampling_metadata)
    # Get the logprobs query results.
    # prompt_logprobs, sample_logprobs = _get_logprobs(
    #     logprobs, sampling_metadata, sample_results)
    prompt_logprobs, sample_logprobs = _get_logprobs(origin_logprobs, sampling_metadata,
                                                     sample_results)
    return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs,
                                 sample_logprobs)


from vllm.model_executor.layers.sampler import Sampler

Sampler._get_logits = _get_logits
Sampler.forward = forward
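Editor's note: a toy, torch.distributed-free illustration of why the patched _get_logits above uses all_gather. With tensor parallelism the LM head is split along the vocab axis, so each rank only produces logits for its own vocab shard; under SPMD every rank samples, so every rank must hold the concatenated full-vocab logits, not just the driver.

import torch

tp_size, batch, shard_vocab = 2, 3, 4
per_rank_logits = [torch.randn(batch, shard_vocab) for _ in range(tp_size)]

# tensor_model_parallel_all_gather concatenates the shards along the last dim
# on every rank; a plain gather would only assemble the full tensor on the
# driver rank, leaving the other ranks unable to sample.
full_logits = torch.cat(per_rank_logits, dim=-1)
print(full_logits.shape)  # torch.Size([3, 8])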
verl/third_party/vllm/vllm_v_0_3_1/model_runner.py (new file, mode 100644)

# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py

from typing import Dict, List, Optional, Tuple, Set, Union
import contextlib
import time
import numpy as np
import torch
import torch.nn as nn

from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import InputMetadata, SamplingMetadata
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.utils import in_wsl
from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner, _async_h2d

from .model_loader import get_model

logger = init_logger(__name__)

KVCache = Tuple[torch.Tensor, torch.Tensor]
_PAD_SLOT_ID = -1
LORA_WARMUP_RANK = 8
# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]


class ModelRunner(ModelRunner):

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        lora_config: Optional[LoRAConfig],
        kv_cache_dtype: Optional[str] = "auto",
    ):
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.lora_config = lora_config

        # model_config can be None in tests/samplers/test_sampler.py.
        # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
        self.sliding_window = (model_config.get_sliding_window()
                               if model_config is not None else None)
        self.device_config = (device_config if device_config is not None else DeviceConfig())
        self.device = self.device_config.device

        self.model = model  # this will be replaced by get_model()
        self.block_size = None  # Set after initial profiling.
        self.lora_manager = None
        self.graph_runners: Dict[int, CUDAGraphRunner] = {}
        self.graph_memory_pool = None  # Set during graph capture.

        self.max_context_len_to_capture = (self.model_config.max_context_len_to_capture
                                           if self.model_config is not None else 0)
        # When using CUDA graph, the input block tables must be padded to
        # max_context_len_to_capture. However, creating the block table in
        # Python can be expensive. To optimize this, we cache the block table
        # in numpy and only copy the actual input content at every iteration.
        # The shape of the cached block table will be
        # (max batch size to capture, max context len to capture / block size).
        self.graph_block_tables = None  # Set after initial profiling.
        # cache in_wsl result
        self.in_wsl = in_wsl()
        self.kv_cache_dtype = kv_cache_dtype

    def load_model(self) -> None:
        self.model = get_model(actor_model=self.model,
                               model_config=self.model_config,
                               device_config=self.device_config,
                               lora_config=self.lora_config)
        vocab_size = self.model.config.vocab_size

        if self.lora_config:
            assert hasattr(self.model, "supported_lora_modules") and self.model.supported_lora_modules, \
                "Model does not support LoRA"
            assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
            assert hasattr(self.model, "embedding_padding_modules"), \
                "Model does not have embedding_padding_modules"
            self.lora_manager = LRUCacheWorkerLoRAManager(
                self.scheduler_config.max_num_seqs,
                self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_paddings,
                vocab_size, self.lora_config, self.device, self.model.embedding_modules,
                self.model.embedding_padding_modules)
            self.model = self.lora_manager.create_lora_manager(self.model)

    def _prepare_sample(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        prompt_lens: List[int],
        subquery_lens: Optional[List[int]],
    ) -> SamplingMetadata:
        seq_groups: List[Tuple[List[int], SamplingParams]] = []
        selected_token_indices: List[int] = []
        selected_token_start_idx = 0
        categorized_sample_indices = {t: [] for t in SamplingType}
        categorized_sample_indices_start_idx = 0

        max_subquery_len = max(subquery_lens) if subquery_lens else 1
        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
            seq_ids = list(seq_group_metadata.seq_data.keys())
            sampling_params = seq_group_metadata.sampling_params
            seq_groups.append((seq_ids, sampling_params))

            if seq_group_metadata.is_prompt:
                assert len(seq_ids) == 1
                assert subquery_lens is not None
                subquery_len = subquery_lens[i]
                if sampling_params.prompt_logprobs is not None:
                    # NOTE: prompt token positions do not need sample, skip
                    categorized_sample_indices_start_idx += subquery_len - 1

                categorized_sample_indices[sampling_params.sampling_type].append(
                    categorized_sample_indices_start_idx)
                categorized_sample_indices_start_idx += 1

                if sampling_params.prompt_logprobs is not None:
                    selected_token_indices.extend(
                        range(selected_token_start_idx,
                              selected_token_start_idx + subquery_len - 1))
                selected_token_indices.append(selected_token_start_idx + subquery_len - 1)
                selected_token_start_idx += max_subquery_len
            else:
                num_seqs = len(seq_ids)
                selected_token_indices.extend(
                    range(selected_token_start_idx, selected_token_start_idx + num_seqs))
                selected_token_start_idx += num_seqs

                categorized_sample_indices[sampling_params.sampling_type].extend(
                    range(categorized_sample_indices_start_idx,
                          categorized_sample_indices_start_idx + num_seqs))
                categorized_sample_indices_start_idx += num_seqs

        selected_token_indices = _async_h2d(selected_token_indices,
                                            dtype=torch.long,
                                            target_device=self.device,
                                            pin_memory=not self.in_wsl)
        categorized_sample_indices = {
            t: _async_h2d(seq_ids,
                          dtype=torch.int,
                          target_device=self.device,
                          pin_memory=not self.in_wsl)
            for t, seq_ids in categorized_sample_indices.items()
        }

        seq_data: Dict[int, SequenceData] = {}
        for seq_group_metadata in seq_group_metadata_list:
            seq_data.update(seq_group_metadata.seq_data)

        sampling_metadata = SamplingMetadata(
            seq_groups=seq_groups,
            seq_data=seq_data,
            prompt_lens=prompt_lens,
            selected_token_indices=selected_token_indices,
            categorized_sample_indices=categorized_sample_indices,
        )
        return sampling_metadata

    def prepare_input_tensors(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata, Set[int], LoRAMapping]:
        # NOTE: We assume that all sequences in the group are all prompts or
        # all decodes.
        is_prompt = seq_group_metadata_list[0].is_prompt
        # Prepare input tensors.
        if is_prompt:
            (input_tokens, input_positions, input_metadata, prompt_lens, subquery_lens,
             lora_index_mapping, lora_prompt_mapping,
             lora_requests) = self._prepare_prompt(seq_group_metadata_list)
        else:
            (input_tokens, input_positions, input_metadata, lora_index_mapping,
             lora_prompt_mapping, lora_requests) = self._prepare_decode(seq_group_metadata_list)
            prompt_lens = []
            subquery_lens = None

        sampling_metadata = self._prepare_sample(seq_group_metadata_list, prompt_lens, subquery_lens)

        if self.lora_config:
            flat_lora_index_mapping = [item for sublist in lora_index_mapping for item in sublist]
            lora_mapping = LoRAMapping(
                flat_lora_index_mapping,
                lora_prompt_mapping,
            )
        else:
            lora_mapping = None

        return (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests,
                lora_mapping)

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
    ) -> Optional[SamplerOutput]:
        (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests,
         lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)

        if self.lora_config:
            self.set_active_loras(lora_requests, lora_mapping)

        # Execute the model.
        if input_metadata.use_cuda_graph:
            graph_batch_size = input_tokens.shape[0]
            model_executable = self.graph_runners[graph_batch_size]
        else:
            model_executable = self.model
        hidden_states = model_executable(
            input_ids=input_tokens,
            positions=input_positions,
            kv_caches=kv_caches,
            input_metadata=input_metadata,
        )

        # Sample the next token.
        output = self.model.sample(
            hidden_states=hidden_states,
            sampling_metadata=sampling_metadata,
        )
        return output

    @torch.inference_mode()
    def profile_run(self) -> None:
        # Enable top-k sampling to reflect the accurate memory usage.
        vocab_size = self.model_config.get_vocab_size()
        # FIXME(sgm): this sampling params will call cumsum(), causing the
        # deterministic cumsum to throw an error
        sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        max_num_seqs = self.scheduler_config.max_num_seqs

        # This represents the maximum number of different requests that will
        # have unique loras, and therefore the maximum memory consumption.
        # Create dummy lora request copies from the lora request passed in,
        # which contains a lora from the lora warmup path.
        dummy_lora_requests = []
        dummy_lora_requests_per_seq = []
        if self.lora_config:
            for idx in range(self.lora_config.max_loras):
                lora_id = idx + 1
                dummy_lora_request = LoRARequest(
                    lora_name=f"warmup_{lora_id}",
                    lora_int_id=lora_id,
                    lora_local_path="/not/a/real/path",
                )
                self.lora_manager.add_dummy_lora(dummy_lora_request, rank=LORA_WARMUP_RANK)
                dummy_lora_requests.append(dummy_lora_request)
            dummy_lora_requests_per_seq = [
                dummy_lora_requests[idx % len(dummy_lora_requests)] for idx in range(max_num_seqs)
            ]

        # Profile memory usage with max_num_sequences sequences and the total
        # number of tokens equal to max_num_batched_tokens.
        seqs: List[SequenceGroupMetadata] = []
        for group_id in range(max_num_seqs):
            seq_len = (max_num_batched_tokens // max_num_seqs +
                       (group_id < max_num_batched_tokens % max_num_seqs))
            seq_data = SequenceData([0] * seq_len)
            seq = SequenceGroupMetadata(
                request_id=str(group_id),
                is_prompt=True,
                seq_data={group_id: seq_data},
                sampling_params=sampling_params,
                block_tables=None,
                lora_request=dummy_lora_requests_per_seq[group_id]
                if dummy_lora_requests_per_seq else None,
            )
            seqs.append(seq)

        # Run the model with the dummy inputs.
        num_layers = self.model_config.get_num_layers(self.parallel_config)
        kv_caches = [(None, None)] * num_layers
        self.execute_model(seqs, kv_caches)
        torch.cuda.synchronize()
        return
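Editor's note: profile_run() spreads max_num_batched_tokens across max_num_seqs dummy sequences, giving the first `remainder` sequences one extra token. A standalone sketch of that arithmetic with made-up values (not taken from any real config):

max_num_batched_tokens, max_num_seqs = 2050, 8

seq_lens = [
    max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)
    for group_id in range(max_num_seqs)
]
print(seq_lens)       # [257, 257, 256, 256, 256, 256, 256, 256]
print(sum(seq_lens))  # 2050 == max_num_batched_tokens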
verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py (new file, mode 100644)

# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""
import torch
import torch.distributed
import vllm.model_executor.parallel_utils.parallel_state as ps

"""
This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
- We assume the Megatron tp+dp+pp world is already established before calling this function.
"""

# Tensor model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None

# Micro data parallel group. The micro data parallel group is an additional dp group that
# originates from splitting the training tp into infer_tp and micro_tp. By default, we use
# the order micro_dp - tp.
_MICRO_DATA_PARALLEL_GROUP = None


def initialize_model_parallel_from_megatron(
        tensor_model_parallel_size=None  # we set None for backward compatibility to set infer_tp = train_tp
) -> None:
    from megatron.core import parallel_state as mpu
    from megatron.distributed import new_group

    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()

    if tensor_model_parallel_size is None:
        tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
    else:
        assert isinstance(tensor_model_parallel_size, int)

    # Build the tensor model-parallel groups.
    assert ps._TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")

    assert tensor_model_parallel_size <= mpu.get_tensor_model_parallel_world_size(
    ), 'Not implemented for infer_tp > train_tp'

    global _TENSOR_MODEL_PARALLEL_GROUP
    global _MICRO_DATA_PARALLEL_GROUP

    assert mpu.get_tensor_model_parallel_world_size() % tensor_model_parallel_size == 0
    micro_dp_size = mpu.get_tensor_model_parallel_world_size() // tensor_model_parallel_size

    world_size: int = torch.distributed.get_world_size()

    num_micro_dp_groups = world_size // micro_dp_size
    rank = torch.distributed.get_rank()

    # Build the micro dp groups.
    assert _MICRO_DATA_PARALLEL_GROUP is None, ("micro data parallel group is already initialized")
    for i in range(num_micro_dp_groups):
        ranks = range(i * micro_dp_size, (i + 1) * micro_dp_size)
        group = new_group(rank=rank, ranks=ranks, group_type='micro_dp')
        if rank in ranks:
            _MICRO_DATA_PARALLEL_GROUP = group

    if tensor_model_parallel_size == mpu.get_tensor_model_parallel_world_size():
        # using the same tp group as Megatron
        ps._TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
        _TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
        # no _MICRO_DATA_PARALLEL_GROUP
    else:
        # initialize a micro_dp group and a tp group
        # assume training tp=4, infer tp=2, then, weight is partitioned as
        # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference

        # Build the inference tp groups
        train_tp = mpu.get_tensor_model_parallel_world_size()
        num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
        num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
        assert _TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
        for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
            start = train_tp * i
            end = train_tp * (i + 1)
            for j in range(num_tensor_model_parallel_groups_per_train_tp):
                ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp))
                for i in range(len(ranks)):
                    ranks[i] += j
                # group = torch.distributed.new_group(ranks)
                group = new_group(rank=rank, ranks=ranks, group_type='infer_tp')
                if rank in ranks:
                    _TENSOR_MODEL_PARALLEL_GROUP = group
        ps._TENSOR_MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP

    # Build the pipeline model-parallel groups.
    # global _PIPELINE_MODEL_PARALLEL_GROUP
    # global _PIPELINE_GLOBAL_RANKS
    # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
    # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
    # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()


"""
Tensor model parallel utilities
"""


def get_tensor_model_parallel_group():
    """Get the tensor model parallel group the caller rank belongs to."""
    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ("tensor model parallel group is not initialized")
    return _TENSOR_MODEL_PARALLEL_GROUP


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())


def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group."""
    global_rank = torch.distributed.get_rank()
    local_world_size = get_tensor_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size


"""
Micro Data parallel group
"""


def get_micro_data_parallel_group():
    assert _MICRO_DATA_PARALLEL_GROUP is not None
    return _MICRO_DATA_PARALLEL_GROUP


def get_micro_data_parallel_world_size():
    return torch.distributed.get_world_size(group=get_micro_data_parallel_group())


def get_micro_data_parallel_rank():
    return torch.distributed.get_rank(group=get_micro_data_parallel_group())
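Editor's note: the nested loop in initialize_model_parallel_from_megatron carves each train-tp group into smaller, interleaved inference-tp groups. Below is a standalone reproduction of just that rank arithmetic for one illustrative case (world_size=8, train_tp=4, infer_tp=2); no process groups are created.

world_size, train_tp, infer_tp = 8, 4, 2
groups_per_train_tp = train_tp // infer_tp      # 2
num_infer_tp_groups = world_size // infer_tp    # 4

infer_tp_groups = []
for i in range(num_infer_tp_groups // groups_per_train_tp):
    start, end = train_tp * i, train_tp * (i + 1)
    for j in range(groups_per_train_tp):
        infer_tp_groups.append([r + j for r in range(start, end, groups_per_train_tp)])

print(infer_tp_groups)  # [[0, 2], [1, 3], [4, 6], [5, 7]]
# Each inner list is one inference tensor-parallel group carved out of a train-tp group.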
verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py (new file, mode 100644)

# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py

from typing import List, Optional, Tuple, Union

from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)
from vllm.lora.request import LoRARequest
from vllm.utils import make_async, LRUCache
from vllm.transformers_utils.tokenizers import *


class TokenizerGroup:
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int]):
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = tokenizer
        if enable_lora:
            self.lora_tokenizers = LRUCache(capacity=max_num_seqs)
        else:
            self.lora_tokenizers = None

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = self.get_lora_tokenizer(lora_request)
        return tokenizer.encode(prompt)

    async def encode_async(self,
                           prompt: str,
                           request_id: Optional[str] = None,
                           lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        return tokenizer.encode(prompt)

    def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            # TODO(sgm): the lora tokenizer is also passed, but may be different
            tokenizer = self.tokenizer
            # tokenizer = (get_lora_tokenizer(
            #     lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers.get(lora_request.lora_int_id)

    # FIXME(sgm): for simplicity, we assign the special token here
    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id
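Editor's note: when enable_lora is set, per-adapter tokenizers are cached by lora_int_id in an LRU of capacity max_num_seqs. A hedged, self-contained sketch of that eviction behaviour using a plain OrderedDict stand-in (vllm.utils.LRUCache is not used here; the class and values are illustrative only):

from collections import OrderedDict

class TinyLRU:
    def __init__(self, capacity: int):
        self.capacity, self._d = capacity, OrderedDict()

    def put(self, key, value):
        self._d[key] = value
        self._d.move_to_end(key)
        if len(self._d) > self.capacity:
            self._d.popitem(last=False)  # evict the least recently used entry

    def __contains__(self, key):
        return key in self._d

cache = TinyLRU(capacity=2)
for lora_id in (1, 2, 3):  # the third insert evicts lora_id 1
    cache.put(lora_id, f"tokenizer-for-{lora_id}")
print(1 in cache, 2 in cache, 3 in cache)  # False True True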
verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py (new file, mode 100644)

# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models

from typing import Dict

import torch
import torch.nn as nn


# NOTE(shengguangming): replace the original weight loader function in the class
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Parallel Linear weight loader."""
    assert param.size() == loaded_weight.size(
    ), 'the parameter size is not aligned with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
        param.size(), loaded_weight.size())
    assert param.data.dtype == loaded_weight.data.dtype, \
        "if we want to share weights, the data type should also be the same"

    param.data = loaded_weight.data


def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Default weight loader."""
    assert param.size() == loaded_weight.size()
    assert param.data.dtype == loaded_weight.data.dtype, \
        "if we want to share weights, the data type should also be the same"

    param.data = loaded_weight.data


def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "lm_head.weight" in name:
            # GPT-2 ties the weights of the embedding layer and the final
            # linear layer.
            continue
        if ".attn.bias" in name or ".attn.masked_bias" in name:
            # Skip attention mask.
            # NOTE: "c_attn.bias" should not be skipped.
            continue
        if not name.startswith("transformer."):
            name = "transformer." + name
        param = params_dict[name]
        # The HF's GPT-2 implementation uses Conv1D instead of Linear.
        # Because of this, we need to transpose the weights.
        # Note(zhuohan): the logic below might break quantized models.
        for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
            if conv1d_weight_name not in name:
                continue
            if not name.endswith(".weight"):
                continue
            # TODO: check megatron
            loaded_weight = loaded_weight.t()
        weight_loader = getattr(param, "weight_loader", default_weight_loader)
        weight_loader(param, loaded_weight)


def llama_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    # NOTE(shengguangming): the megatron llama may have this prefix
    prefix = '0.module.module.'
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if name[:len(prefix)] == prefix:
            name = name[len(prefix):]
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def mistral_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    # TODO: need to implement a general way to deal with prefix
    prefix = '0.module.module.'
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if name[:len(prefix)] == prefix:
            name = name[len(prefix):]
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
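Editor's note: the llama/mistral loaders above strip a Megatron wrapper prefix from each actor parameter name before looking it up in the vLLM model. A small standalone illustration of that renaming; the parameter names here are made up.

prefix = '0.module.module.'
actor_names = [
    '0.module.module.model.layers.0.self_attn.qkv_proj.weight',
    'model.embed_tokens.weight',                                      # already unprefixed
    '0.module.module.model.layers.0.self_attn.rotary_emb.inv_freq',  # skipped below
]

for name in actor_names:
    if name[:len(prefix)] == prefix:
        name = name[len(prefix):]
    if "rotary_emb.inv_freq" in name:
        continue  # rotary caches are recomputed by vLLM, never loaded
    print(name)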
verl/third_party/vllm/vllm_v_0_3_1/worker.py (new file, mode 100644)

# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
"""A GPU worker class."""
import os
import gc
from typing import Dict, List, Tuple, Optional, Union, Set

import torch
import torch.distributed
import torch.nn as nn

from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig,
                         LoRAConfig)
from vllm.model_executor import InputMetadata, set_random_seed
from vllm.model_executor.parallel_utils.parallel_state import (initialize_model_parallel)
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.worker.cache_engine import CacheEngine
from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_group
from .model_runner import ModelRunner
from .model_loader import load_weights
from .parallel_state import initialize_model_parallel_from_megatron
from vllm.lora.request import LoRARequest


class Worker:
    """A worker class that executes (a partition of) the model on a GPU.

    Each worker is associated with a single GPU. The worker is responsible for
    maintaining the KV cache and executing the model on the GPU. In case of
    distributed inference, each worker is assigned a partition of the model.
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        rank: Optional[int] = None,
        distributed_init_method: Optional[str] = None,
        lora_config: Optional[LoRAConfig] = None,
        kv_cache_dtype: Optional[str] = "auto",
    ) -> None:
        # self.model = model  # will be replaced in the init_model
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.rank = rank
        self.distributed_init_method = distributed_init_method
        self.lora_config = lora_config

        self.model_runner = ModelRunner(
            model,
            model_config,
            parallel_config,
            scheduler_config,
            device_config,
            lora_config=self.lora_config,
            kv_cache_dtype=kv_cache_dtype,
        )

        # Uninitialized cache engine. Will be initialized by
        # self.init_cache_engine().
        self.cache_config = None
        self.block_size = None
        self.sliding_window = None
        self.cache_engine = None
        self.cache_events = None
        self.gpu_cache = None

        # For offloading inference engine params
        self.cpu_model = None

    def init_model(self, cupy_port: Optional[int] = None):
        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # Env vars will be set by TORCHRUN.
        self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.device = torch.device(f"cuda:{local_rank}")
        if self.rank < 0:
            raise ValueError("Invalid or unspecified rank.")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Initialize the distributed environment.
        # TODO: do not use cupy
        _init_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method)
        if not self.parallel_config.disable_custom_all_reduce:
            init_custom_ar()
        # Initialize the model.
        set_random_seed(self.model_config.seed)
        # self.model = get_model(actor_model=self.model, model_config=self.model_config)

    def load_model(self):
        self.model_runner.load_model()

    @torch.inference_mode()
    def profile_num_available_blocks(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        cpu_swap_space: int,
        cache_dtype: str,
    ) -> Tuple[int, int]:
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
        torch.cuda.empty_cache()
        # torch.cuda.reset_peak_memory_stats()

        # Execute a forward pass with dummy inputs to profile the memory usage
        # of the model.
        self.model_runner.profile_run()

        # Calculate the number of blocks that can be allocated with the
        # profiled peak memory.
        torch.cuda.synchronize()
        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
        peak_memory = total_gpu_memory - free_gpu_memory

        cache_block_size = CacheEngine.get_cache_block_size(block_size, cache_dtype,
                                                            self.model_config, self.parallel_config)
        # NOTE(sgm) use the remaining memory
        num_gpu_blocks = int((free_gpu_memory * gpu_memory_utilization) // cache_block_size)
        # num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size)

        num_cpu_blocks = int(cpu_swap_space // cache_block_size)
        num_gpu_blocks = max(num_gpu_blocks, 0)
        num_cpu_blocks = max(num_cpu_blocks, 0)
        if self.model_runner.lora_manager:
            self.model_runner.remove_all_loras()
        gc.collect()
        torch.cuda.empty_cache()

        # Synchronize the number of blocks across all ranks.
        num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
        num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
        torch.distributed.all_reduce(num_gpu_blocks,
                                     op=torch.distributed.ReduceOp.MIN,
                                     group=get_tensor_model_parallel_group())
        torch.distributed.all_reduce(num_cpu_blocks,
                                     op=torch.distributed.ReduceOp.MIN,
                                     group=get_tensor_model_parallel_group())
        num_gpu_blocks = num_gpu_blocks.item()
        num_cpu_blocks = num_cpu_blocks.item()
        return num_gpu_blocks, num_cpu_blocks
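Editor's note: profile_num_available_blocks divides the post-profiling free GPU memory (scaled by gpu_memory_utilization) by the per-block KV cache size. A back-of-the-envelope sketch of that budgeting; all numbers below are made up for illustration and do not come from a real profiling run.

free_gpu_memory = 40 * 1024**3        # 40 GiB left after the dummy forward pass
gpu_memory_utilization = 0.9
cpu_swap_space = 4 * 1024**3          # 4 GiB of CPU swap

# Roughly: block_size * num_layers * num_kv_heads * head_dim * 2 (K and V) * dtype_size
cache_block_size = 16 * 32 * 8 * 128 * 2 * 2   # 2 MiB per block in this example

num_gpu_blocks = int((free_gpu_memory * gpu_memory_utilization) // cache_block_size)
num_cpu_blocks = int(cpu_swap_space // cache_block_size)
print(num_gpu_blocks, num_cpu_blocks)  # 18432 2048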
    def init_cache_engine(self, cache_config: CacheConfig) -> None:
        if self.cache_engine is None and self.gpu_cache is None:
            self.cache_config = cache_config
            self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config)
            self.cache_events = self.cache_engine.events
            self.gpu_cache = self.cache_engine.gpu_cache
            self.model_runner.set_block_size(self.cache_engine.block_size)

    def free_cache_engine(self):
        # ensure `enforce_eager=True`
        self.cache_engine = None
        self.gpu_cache = None

    def warm_up_model(self) -> None:
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model(self.gpu_cache)
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)

    def cache_swap(
        self,
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> None:
        # Issue cache operations.
        issued_cache_op = False
        if blocks_to_swap_in:
            self.cache_engine.swap_in(blocks_to_swap_in)
            issued_cache_op = True
        if blocks_to_swap_out:
            self.cache_engine.swap_out(blocks_to_swap_out)
            issued_cache_op = True
        if blocks_to_copy:
            self.cache_engine.copy(blocks_to_copy)
            issued_cache_op = True

        cache_events = self.cache_events if issued_cache_op else None

        # Wait for cache operations to finish.
        # TODO(woosuk): Profile swapping overhead and optimize if needed.
        if cache_events is not None:
            for event in cache_events:
                event.wait()

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        num_seq_groups = len(seq_group_metadata_list)
        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)

        # If there is no input, we don't need to execute the model.
        if num_seq_groups == 0:
            return {}

        output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)
        return output

        # # Prepare input tensors.
        # # NOTE(shengguangming): currently we pad in our dataloader and unpad it in pre_process_input,
        # # we can just input un-padded sequence for better performance
        # input_tokens, input_positions, input_metadata = self._prepare_inputs(seq_group_metadata_list)
        # # Execute the model.
        # output = self.model(
        #     input_ids=input_tokens,
        #     positions=input_positions,
        #     kv_caches=self.gpu_cache,
        #     input_metadata=input_metadata,
        #     cache_events=cache_events,
        # )
        # return output

    # assume the input is .state_dict()
    def sync_model_weights(self, actor_weights: Dict):
        load_weights(actor_weights, self.model_runner.model)

    def offload_model_weights(self) -> None:
        if self.cpu_model == None:
            self.cpu_model = {}
            for name, params in self.model_runner.model.named_parameters():
                self.cpu_model[name] = torch.empty_like(params, device='cpu')
                params.data = self.cpu_model[name]
        else:
            for name, params in self.model_runner.model.named_parameters():
                params.data = self.cpu_model[name]

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_runner.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_runner.remove_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.model_runner.list_loras()


def _init_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
) -> None:
    """Initialize the distributed environment."""
    if torch.distributed.is_initialized():
        print('The distributed environment has been initialized before vLLM')
    elif not distributed_init_method:
        raise ValueError("distributed_init_method must be set if torch.distributed "
                         "is not already initialized")
    else:
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=parallel_config.world_size,
            rank=rank,
            # init_method=distributed_init_method,
        )

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cuda())
    # TODO (shengguangming): maybe we should also flag that megatron is initialized
    if torch.distributed.get_world_size() > 1:
        initialize_model_parallel_from_megatron(tensor_model_parallel_size=parallel_config.tensor_parallel_size)
    else:
        initialize_model_parallel()


def _pad_to_alignment(x: List[int], multiple_of: int, pad: int) -> List[int]:
    return x + [pad] * ((-len(x)) % multiple_of)


def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
    return x + [pad] * (max_len - len(x))
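Editor's note: a tiny standalone demonstration of the two padding helpers above, one rounding a token list up to a multiple and the other padding it to a fixed length; the token values are arbitrary.

def _pad_to_alignment(x, multiple_of, pad):
    return x + [pad] * ((-len(x)) % multiple_of)

def _pad_to_max(x, max_len, pad):
    return x + [pad] * (max_len - len(x))

tokens = [101, 2009, 2003]
print(_pad_to_alignment(tokens, multiple_of=8, pad=0))  # length rounded up to 8
print(_pad_to_max(tokens, max_len=6, pad=0))            # [101, 2009, 2003, 0, 0, 0]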
def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
    # Check if the GPU supports the dtype.
    if torch_dtype == torch.bfloat16:
        compute_capability = torch.cuda.get_device_capability()
        if compute_capability[0] < 8:
            gpu_name = torch.cuda.get_device_name()
            raise ValueError("Bfloat16 is only supported on GPUs with compute capability "
                             f"of at least 8.0. Your {gpu_name} GPU has compute capability "
                             f"{compute_capability[0]}.{compute_capability[1]}.")
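Editor's note: the check above encodes the bf16 requirement as a major compute capability of at least 8. A standalone sketch of the same gate with hard-coded example capabilities (no GPU query involved); V100 is capability (7, 0), A100 is (8, 0), H100 is (9, 0).

def supports_bf16(compute_capability):
    major, _minor = compute_capability
    return major >= 8

for name, cap in [("V100", (7, 0)), ("A100", (8, 0)), ("H100", (9, 0))]:
    print(name, supports_bf16(cap))  # V100 False, A100 True, H100 True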
verl/third_party/vllm/vllm_v_0_4_2/__init__.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py
0 → 100644
View file @
f87b35b2
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
import
os
import
argparse
import
dataclasses
from
dataclasses
import
dataclass
from
typing
import
List
,
Optional
,
Union
import
torch.nn
as
nn
from
transformers
import
PretrainedConfig
from
.config
import
ModelConfig
,
LoadConfig
from
vllm.config
import
(
CacheConfig
,
DecodingConfig
,
DeviceConfig
,
EngineConfig
,
LoRAConfig
,
ParallelConfig
,
SchedulerConfig
,
SpeculativeConfig
,
TokenizerPoolConfig
,
VisionLanguageConfig
)
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.utils
import
str_to_int_tuple
def
nullable_str
(
val
:
str
):
if
not
val
or
val
==
"None"
:
return
None
return
val
@
dataclass
class
EngineArgs
:
"""Arguments for vLLM engine."""
model_hf_config
:
PretrainedConfig
=
None
skip_tokenizer_init
:
bool
=
False
served_model_name
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
# TODO
download_dir
:
Optional
[
str
]
=
None
load_format
:
str
=
'auto'
dtype
:
str
=
'auto'
kv_cache_dtype
:
str
=
'auto'
quantization_param_path
:
Optional
[
str
]
=
None
seed
:
int
=
0
max_model_len
:
Optional
[
int
]
=
None
worker_use_ray
:
bool
=
False
pipeline_parallel_size
:
int
=
1
tensor_parallel_size
:
int
=
1
max_parallel_loading_workers
:
Optional
[
int
]
=
None
block_size
:
int
=
16
enable_prefix_caching
:
bool
=
False
use_v2_block_manager
:
bool
=
False
swap_space
:
int
=
4
# GiB
gpu_memory_utilization
:
float
=
0.90
max_num_batched_tokens
:
Optional
[
int
]
=
None
max_num_seqs
:
int
=
256
max_logprobs
:
int
=
5
# OpenAI default value
disable_log_stats
:
bool
=
False
revision
:
Optional
[
str
]
=
None
code_revision
:
Optional
[
str
]
=
None
tokenizer_revision
:
Optional
[
str
]
=
None
quantization
:
Optional
[
str
]
=
None
enforce_eager
:
bool
=
False
max_context_len_to_capture
:
Optional
[
int
]
=
None
max_seq_len_to_capture
:
int
=
8192
disable_custom_all_reduce
:
bool
=
False
tokenizer_pool_size
:
int
=
0
tokenizer_pool_type
:
str
=
"ray"
tokenizer_pool_extra_config
:
Optional
[
dict
]
=
None
enable_lora
:
bool
=
False
max_loras
:
int
=
1
max_lora_rank
:
int
=
16
fully_sharded_loras
:
bool
=
False
lora_extra_vocab_size
:
int
=
256
lora_dtype
=
'auto'
max_cpu_loras
:
Optional
[
int
]
=
None
device
:
str
=
'auto'
ray_workers_use_nsight
:
bool
=
False
num_gpu_blocks_override
:
Optional
[
int
]
=
None
num_lookahead_slots
:
int
=
0
model_loader_extra_config
:
Optional
[
dict
]
=
None
# Related to Vision-language models such as llava
image_input_type
:
Optional
[
str
]
=
None
image_token_id
:
Optional
[
int
]
=
None
image_input_shape
:
Optional
[
str
]
=
None
image_feature_size
:
Optional
[
int
]
=
None
scheduler_delay_factor
:
float
=
0.0
enable_chunked_prefill
:
bool
=
False
guided_decoding_backend
:
str
=
'outlines'
# Speculative decoding configuration.
speculative_model
:
Optional
[
str
]
=
None
num_speculative_tokens
:
Optional
[
int
]
=
None
speculative_max_model_len
:
Optional
[
int
]
=
None
ngram_prompt_lookup_max
:
Optional
[
int
]
=
None
ngram_prompt_lookup_min
:
Optional
[
int
]
=
None
    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # Model arguments
        # TODO(shengguangming): delete the unused args
        parser.add_argument('--model',
                            type=str,
                            default='facebook/opt-125m',
                            help='name or path of the huggingface model to use')
        parser.add_argument('--tokenizer',
                            type=str,
                            default=EngineArgs.tokenizer,
                            help='name or path of the huggingface tokenizer to use')
        parser.add_argument('--revision',
                            type=str,
                            default=None,
                            help='the specific model version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-revision',
                            type=str,
                            default=None,
                            help='the specific tokenizer version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument('--load-format',
                            type=str,
                            default=EngineArgs.load_format,
                            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
                            help='The format of the model weights to load. '
                            '"auto" will try to load the weights in the safetensors format '
                            'and fall back to the pytorch bin format if safetensors format '
                            'is not available. '
                            '"pt" will load the weights in the pytorch bin format. '
                            '"safetensors" will load the weights in the safetensors format. '
                            '"npcache" will load the weights in pytorch format and store '
                            'a numpy cache to speed up the loading. '
                            '"dummy" will initialize the weights with random values, '
                            'which is mainly for profiling.')
        parser.add_argument('--dtype',
                            type=str,
                            default=EngineArgs.dtype,
                            choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
                            help='data type for model weights and activations. '
                            'The "auto" option will use FP16 precision '
                            'for FP32 and FP16 models, and BF16 precision '
                            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for '
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', None],
                            default=None,
                            help='Method used to quantize the weights')
        return parser
    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args
    def create_engine_config(self,) -> EngineConfig:
        device_config = DeviceConfig(self.device)

        # NOTE(sgm): we only modify ModelConfig, other configs are import from vllm
        model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.revision, self.code_revision,
                                   self.tokenizer_revision, self.max_model_len, self.quantization,
                                   self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture,
                                   self.max_seq_len_to_capture, self.max_logprobs, self.skip_tokenizer_init,
                                   self.served_model_name)

        cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype,
                                   self.num_gpu_blocks_override, model_config.get_sliding_window(),
                                   self.enable_prefix_caching)
        parallel_config = ParallelConfig(
            self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
            self.max_parallel_loading_workers, self.disable_custom_all_reduce,
            TokenizerPoolConfig.create_config(
                self.tokenizer_pool_size,
                self.tokenizer_pool_type,
                self.tokenizer_pool_extra_config,
            ), self.ray_workers_use_nsight)

        # Use the world_size set by TORCHRUN
        world_size = int(os.getenv("WORLD_SIZE", "-1"))
        assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
        parallel_config.world_size = world_size

        # TODO: spec config
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
        )

        scheduler_config = SchedulerConfig(
            self.max_num_batched_tokens,
            self.max_num_seqs,
            model_config.max_model_len,
            self.use_v2_block_manager,
            num_lookahead_slots=(self.num_lookahead_slots
                                 if speculative_config is None else speculative_config.num_lookahead_slots),
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
        )

        lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
                                 max_loras=self.max_loras,
                                 fully_sharded_loras=self.fully_sharded_loras,
                                 lora_extra_vocab_size=self.lora_extra_vocab_size,
                                 lora_dtype=self.lora_dtype,
                                 max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0
                                 else None) if self.enable_lora else None

        load_config = LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            model_loader_extra_config=self.model_loader_extra_config,
        )

        if self.image_input_type:
            if (not self.image_token_id or not self.image_input_shape or not self.image_feature_size):
                raise ValueError('Specify `image_token_id`, `image_input_shape` and '
                                 '`image_feature_size` together with `image_input_type`.')
            vision_language_config = VisionLanguageConfig(
                image_input_type=VisionLanguageConfig.get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
            )
        else:
            vision_language_config = None

        decoding_config = DecodingConfig(guided_decoding_backend=self.guided_decoding_backend)

        return EngineConfig(model_config=model_config,
                            cache_config=cache_config,
                            parallel_config=parallel_config,
                            scheduler_config=scheduler_config,
                            device_config=device_config,
                            lora_config=lora_config,
                            vision_language_config=vision_language_config,
                            speculative_config=speculative_config,
                            load_config=load_config,
                            decoding_config=decoding_config)
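
# --- Illustrative sketch, not part of the original file ---
# Minimal example of driving the trimmed CLI defined above. It only exercises
# `add_cli_args`; building a full `EngineConfig` additionally requires setting
# `model_hf_config` and launching under torchrun so that WORLD_SIZE is defined, and
# `from_cli_args` assumes every dataclass field has a matching parsed attribute,
# which the reduced flag set here may not provide. The flag values are placeholders.
if __name__ == "__main__":
    cli_parser = argparse.ArgumentParser(description="verl vLLM engine arguments (sketch)")
    cli_parser = EngineArgs.add_cli_args(cli_parser)
    cli_args = cli_parser.parse_args(["--model", "facebook/opt-125m", "--tensor-parallel-size", "1"])
    # Echo a few parsed values to show the registered flags and their defaults.
    print(cli_args.model, cli_args.tensor_parallel_size, cli_args.dtype, cli_args.block_size)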
verl/third_party/vllm/vllm_v_0_4_2/config.py
0 → 100644
View file @ f87b35b2
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
import enum
import json
from typing import List, Optional, Union
from dataclasses import dataclass, field, fields

from transformers import PretrainedConfig

from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import get_quantization_config
from vllm.transformers_utils.config import get_hf_text_config
from vllm.utils import is_hip

# Add for verl
from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len

GPTQMarlinConfig = get_quantization_config("gptq_marlin")

logger = init_logger(__name__)

_GB = 1 << 30
class ModelConfig(ModelConfig):
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        download_dir: Directory to download and load the weights, default to the
            default cache directory of huggingface.
        load_format: The format of the model weights to load:
            "auto" will try to load the weights in the safetensors format and
                fall back to the pytorch bin format if safetensors format is
                not available.
            "pt" will load the weights in the pytorch bin format.
            "safetensors" will load the weights in the safetensors format.
            "npcache" will load the weights in pytorch format and store
                a numpy cache to speed up the loading.
            "dummy" will initialize the weights with random values, which is
                mainly for profiling.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
            be used to load activation and weight scaling factors when the
            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
        served_model_name: The model name used in metrics tag `model_name`,
            matches the model name exposed via the APIs. If multiple model
            names provided, the first name will be used. If not specified,
            the model name will be the same as `model`.
    """
    def __init__(
        self,
        hf_config: PretrainedConfig,
        dtype: str,
        seed: int,
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        quantization_param_path: Optional[str] = None,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 5,
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, List[str]]] = None,
    ) -> None:
        self.model = hf_config._name_or_path
        self.tokenizer = hf_config._name_or_path
        self.seed = seed
        self.revision = revision
        self.code_revision = code_revision
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture
        if self.max_context_len_to_capture is not None:
            raise ValueError("`max_context_len_to_capture` is deprecated. "
                             "Use `max_seq_len_to_capture` instead.")
        self.max_seq_len_to_capture = (max_seq_len_to_capture or max_context_len_to_capture)
        self.max_logprobs = max_logprobs
        self.skip_tokenizer_init = skip_tokenizer_init

        # self.hf_config = get_config(model, trust_remote_code, revision)
        self.hf_config = hf_config
        self.hf_text_config = get_hf_text_config(hf_config)
        # TODO: for multimodal model
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
        # self.served_model_name = get_served_model_name(model,
        #                                                served_model_name)
        # self._verify_load_format()
        # self._verify_tokenizer_mode()
        self._verify_quantization()
        self._verify_cuda_graph()
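
# --- Illustrative sketch, not part of the original file ---
# The override above consumes an already-instantiated transformers `PretrainedConfig`
# (note that `model` and `tokenizer` are read from `hf_config._name_or_path`) instead of
# loading the config from a model path as upstream vLLM does. The helper below is only an
# illustration: the model name is a placeholder, and resolving it requires network access
# or a local HuggingFace cache.
def _example_model_config(model_name: str = "facebook/opt-125m") -> "ModelConfig":
    from transformers import AutoConfig

    hf_config = AutoConfig.from_pretrained(model_name)
    # dtype="auto" keeps FP16 for FP32/FP16 checkpoints and BF16 for BF16 checkpoints,
    # matching the class docstring; max_model_len is derived from the HF config when None.
    return ModelConfig(hf_config=hf_config, dtype="auto", seed=0)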
class LoadFormat(str, enum.Enum):
    AUTO = 'auto'
    MEGATRON = "megatron"
    HF = "hf"
    DTENSOR = 'dtensor'
    DUMMY_HF = 'dummy_hf'
    DUMMY_MEGATRON = 'dummy_megatron'
    DUMMY_DTENSOR = 'dummy_dtensor'
@dataclass
class LoadConfig:
    """
    download_dir: Directory to download and load the weights, default to the
        default cache directory of huggingface.
    load_format: The format of the model weights to load:
        "auto" will try to load the weights in the safetensors format and
            fall back to the pytorch bin format if safetensors format is
            not available.
        "pt" will load the weights in the pytorch bin format.
        "safetensors" will load the weights in the safetensors format.
        "npcache" will load the weights in pytorch format and store
            a numpy cache to speed up the loading.
        "dummy" will initialize the weights with random values, which is
            mainly for profiling.
        "tensorizer" will use CoreWeave's tensorizer library for
            fast weight loading.
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
    def __post_init__(self):
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
            self.model_loader_extra_config = json.loads(model_loader_extra_config)
        self._verify_load_format()
    def _verify_load_format(self) -> None:
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        self.load_format = LoadFormat(load_format)

        rocm_not_supported_load_format: List[str] = []
        if is_hip() and load_format in rocm_not_supported_load_format:
            rocm_supported_load_format = [
                f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
            ]
            raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
                             f"Supported load formats are "
                             f"{rocm_supported_load_format}")
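
# --- Illustrative sketch, not part of the original file ---
# How LoadConfig normalizes its inputs: a plain-string `load_format` is validated and
# converted to a LoadFormat enum member in `_verify_load_format`, and a JSON string
# passed as `model_loader_extra_config` is decoded into a dict in `__post_init__`.
# The values below are placeholders chosen for illustration.
if __name__ == "__main__":
    cfg = LoadConfig(load_format="dtensor",
                     model_loader_extra_config='{"some_flag": true}')
    assert cfg.load_format is LoadFormat.DTENSOR
    assert cfg.model_loader_extra_config == {"some_flag": True}
    print(cfg)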