"test/vscode:/vscode.git/clone" did not exist on "590f2da052058dfa902b291d1b2be5d27f8f231b"
Commit f87b35b2 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2648 failed with stages
in 0 seconds
# Detached Worker
## How to run (Only on a single node)
- Start a local ray cluster:
```bash
ray start --head --port=6379
```
- Run the server
```bash
python3 server.py
```
- In another terminal, run the client
```bash
python3 client.py
```
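- Or run the whole flow in one go (the same commands as the shell script included in this commit):
```bash
ray start --head --port=6379
python3 server.py
python3 client.py
ray stop --force
```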
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In the client, we obtain handles to the detached server workers and send RPC requests.
"""
import ray
import torch
from verl import DataProto
from verl.single_controller.ray import RayClassWithInitArgs
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
from tensordict import TensorDict
from server import Trainer
def compute_position_id_with_mask(mask):
return torch.clip(torch.cumsum(mask, dim=-1) - 1, min=0, max=None)
if __name__ == '__main__':
ray.init(address='auto', namespace='verl')
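# connect to the running cluster; the namespace must match the one used by server.py so the detached workers can be found by name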
# get the worker group using names
worker_names = ['trainerTrainer_0:0', 'trainerTrainer_0:1']
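# server.py prints these worker names after it creates the detached worker group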
cls_with_init_args = RayClassWithInitArgs(cls=Trainer)
worker_group = NVMegatronRayWorkerGroup.from_detached(worker_names=worker_names,
ray_cls_with_init=cls_with_init_args)
batch_size = 16
sequence_length = 1024
# give Trainer some data to train
input_ids = torch.randint(low=0, high=256, size=(batch_size, sequence_length), dtype=torch.int64, device='cuda')
attention_mask = torch.ones_like(input_ids)
position_ids = compute_position_id_with_mask(attention_mask)
data = DataProto(batch=TensorDict(
{
'input_ids': input_ids,
'attention_mask': attention_mask,
'position_ids': position_ids
}, batch_size=batch_size),
meta_info={})
output = worker_group.train_model(data)
print(output)
#!/bin/bash
ray start --head --port=6379
python3 server.py
python3 client.py
ray stop --force
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Server starts a Trainer. Client sends data to the server to train.
"""
import os
os.environ['MEGATRON_USE_CUDA_TIMER'] = '0'
os.environ['MEGATRON_START_PROCESS_TIMER'] = 'False'
os.environ['NCCL_DEBUG'] = 'WARN'
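# Megatron's timers and NCCL logging are configured through these environment variables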
import torch
from torch import nn
import ray
from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
from verl.single_controller.base.megatron.worker import MegatronWorker
from verl.single_controller.base.decorator import register, Dispatch
from verl import DataProto
from verl.models.llama.megatron import ParallelLlamaForCausalLMRmPadPP
from megatron.core import parallel_state as mpu
from megatron.core.models.gpt.gpt_model import ModelType
from megatron.core import tensor_parallel
from verl.utils.megatron_utils import get_model, init_megatron_optim_config, mcore_model_parallel_config
from verl.utils.megatron.optimizer import get_megatron_optimizer
from transformers import LlamaConfig
from omegaconf import OmegaConf
from tensordict import TensorDict
@ray.remote
class Trainer(MegatronWorker):
def __init__(self):
super().__init__()
if not torch.distributed.is_initialized():
rank = int(os.environ['LOCAL_RANK'])
torch.distributed.init_process_group(backend="nccl")
torch.cuda.set_device(rank)
os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
mpu.initialize_model_parallel(
tensor_model_parallel_size=2,
pipeline_model_parallel_size=1,
virtual_pipeline_model_parallel_size=None,
pipeline_model_parallel_split_rank=None,
use_sharp=False,
context_parallel_size=1,
expert_model_parallel_size=1,
nccl_communicator_config_path=None,
)
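# seed the model-parallel CUDA RNG tracker (Megatron keeps separate random state per tensor-parallel rank)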
tensor_parallel.model_parallel_cuda_manual_seed(10)
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def init_model(self):
actor_model_config = LlamaConfig(vocab_size=256,
hidden_size=2048,
intermediate_size=5504,
num_hidden_layers=24,
num_attention_heads=16,
num_key_value_heads=16)
megatron_config = mcore_model_parallel_config(sequence_parallel=True, params_dtype=torch.bfloat16)
self.megatron_config = megatron_config
def megatron_actor_model_provider(pre_process, post_process):
# vpp is not supported yet because it hangs for an unknown reason; needs debugging
vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
# this_megatron_config = copy.deepcopy(megatron_config)
# this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
parallel_model = ParallelLlamaForCausalLMRmPadPP(config=actor_model_config,
megatron_config=megatron_config,
pre_process=pre_process,
post_process=post_process)
parallel_model.cuda()
return parallel_model
actor_module = get_model(model_provider_func=megatron_actor_model_provider,
model_type=ModelType.encoder_or_decoder,
wrap_with_ddp=True)
actor_module = nn.ModuleList(actor_module)
optim_config = OmegaConf.create({'lr': 1e-6, 'clip_grad': 1.0})
optim_config = init_megatron_optim_config(optim_config)
self.optimizer_config = optim_config
actor_optimizer = get_megatron_optimizer(model=actor_module, config=optim_config)
self.model = actor_module[0]
self.optimizer = actor_optimizer
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def train_model(self, data: DataProto) -> DataProto:
input_ids = data.batch['input_ids']
attention_mask = data.batch['attention_mask']
position_ids = data.batch['position_ids']
self.optimizer.zero_grad()
self.model.zero_grad_buffer(
zero_buffer=(not self.optimizer_config.use_distributed_optimizer
)) # use use_contiguous_buffers_in_local_ddp and no overlap_dp_param_comm
# update for 1 iteration
output = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids).logits
output.mean().backward()
update_successful, grad_norm, num_zeros_in_grad = self.optimizer.step(self.megatron_config,
self.megatron_config.timers)
return DataProto(batch=TensorDict({'loss': output.detach()}, batch_size=output.shape[0]))
if __name__ == '__main__':
ray.init(address='auto', namespace='verl')
resource_pool = RayResourcePool(process_on_nodes=[2], detached=True)
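# 2 workers on one node; detached=True keeps the actors alive after this driver exits so client.py can attach later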
cls_with_init_args = RayClassWithInitArgs(cls=Trainer)
worker_group = NVMegatronRayWorkerGroup(
resource_pool=resource_pool,
ray_cls_with_init=cls_with_init_args,
name_prefix='trainer',
detached=True,
)
worker_group.init_model()
worker_names = worker_group.worker_names
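# print the worker names; client.py passes them to from_detached() to re-attach to these workers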
print(worker_names)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import os
import subprocess
def test():
wait_time = 10
my_env = os.environ.copy()
my_env["WAIT_TIME"] = str(wait_time)
p = subprocess.Popen(["python3", "-u", "./check_worker_alive/main.py"], env=my_env, stdout=subprocess.PIPE)
count = 0
while b"foo started" not in p.stdout.read():
time.sleep(1)
count += 1
if count > 40:
raise RuntimeError("timeout for start foo in check_worker_alive/main.py")
print(
time.time(),
f"wait 1.5 wait time {wait_time*1.5} to let signal returned to process but still not exceed process wait time")
time.sleep(wait_time * 1.5)
print(time.time(), f"start checking")
assert p.poll() is not None, f"process {p} still alive, expecting signal raised abort"
assert p.returncode != 0, f"process {p} exit with code 0, expecting not-zero exit code"
print(f"test passed")
if __name__ == "__main__":
test()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
from verl.single_controller.base import Worker
from verl.single_controller.base.decorator import register, Dispatch
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, create_colocated_worker_cls
from verl import DataProto
@ray.remote
class Actor(Worker):
def __init__(self) -> None:
super().__init__()
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def add(self, data: DataProto):
data.batch['a'] += self.rank
return data
@ray.remote
class Critic(Worker):
def __init__(self, config) -> None:
super().__init__()
self.config = config
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def sub(self, data: DataProto):
data.batch['a'] -= self.config['b']
return data
def test_colocated_workers():
ray.init()
import torch
data = DataProto.from_dict({'a': torch.zeros(10)})
# create separate workers on the same resource pool
actor_cls = RayClassWithInitArgs(cls=Actor)
critic_cls = RayClassWithInitArgs(cls=Critic, config={'b': 10})
resource_pool = RayResourcePool(process_on_nodes=[2])
actor_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=actor_cls)
critic_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=critic_cls)
expected_actor_output = actor_wg.add(data)
expected_critic_output = critic_wg.sub(data)
# create colocated workers
cls_dict = {'actor': actor_cls, 'critic': critic_cls}
ray_cls_with_init = create_colocated_worker_cls(cls_dict)
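# the colocated class packs Actor and Critic into a single worker per process; spawn() below splits it back into
# per-role worker groups that share the same underlying workers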
wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
spawn_wg = wg_dict.spawn(prefix_set=cls_dict.keys())
colocated_actor_wg = spawn_wg['actor']
colocated_critic_wg = spawn_wg['critic']
actor_output = colocated_actor_wg.add(data)
critic_output = colocated_critic_wg.sub(data)
torch.testing.assert_close(expected_actor_output.batch, actor_output.batch, atol=0, rtol=0)
torch.testing.assert_close(expected_critic_output.batch, critic_output.batch, atol=0, rtol=0)
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In this test, we instantiate a data-parallel worker group across 8 GPUs
"""
from verl.single_controller.base import Worker
from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
from verl.single_controller.base.decorator import Dispatch, register
import ray
import torch
from torch import distributed as dist
from verl import DataProto
from verl.utils.ray_utils import parallel_put
from codetiming import Timer
import tensordict
@ray.remote
class DummyWorker(Worker):
def __init__(self):
super().__init__()
dist.init_process_group()
@register(dispatch_mode=Dispatch.DP_COMPUTE, blocking=False)
def do_nothing(self, data):
for key in data.batch.keys():
data.batch[key] += 1
if tensordict.__version__ >= '0.5.0':
data.batch = data.batch.consolidate()
return data
def test_data_transfer():
ray.init()
# construct resource pool
resource_pool = RayResourcePool([8])
cls_with_init = RayClassWithInitArgs(cls=DummyWorker)
# construct worker group
wg = RayWorkerGroup(resource_pool, cls_with_init)
# this is real dataset size
batch_size = 4096
seqlen = 32768
data_dict = {}
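# two int64 tensors of shape (4096, 32768): roughly 1 GiB each, ~2 GiB payload in total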
for i in range(2):
data_dict[str(i)] = torch.randint(0, 10000, (batch_size, seqlen))
data = DataProto.from_dict(tensors=data_dict)
print(data)
# we manually split data here and send to each worker
data_list = data.chunk(wg.world_size)
for i in range(wg.world_size):
# consolidate is necessary: it packs the TensorDict into one contiguous storage so it serializes efficiently
if tensordict.__version__ >= '0.5.0':
data_list[i].batch = data_list[i].batch.consolidate()
with Timer(name='ray.pickle', initial_text=True):
for i in range(wg.world_size):
ray.cloudpickle.pickle.dumps(data_list[i])
with Timer(name='raw.pickle', initial_text=True):
import pickle
for i in range(wg.world_size):
pickle.dumps(data_list[i])
# we put in advance
with Timer(name='put', initial_text=True):
# takes around 40 seconds
data_list_ref = parallel_put(data_list)
# for i in range(wg.world_size):
# data_list[i] = ray.put(data_list[i])
with Timer(name='launch', initial_text=True):
output_ref = wg.do_nothing(data_list_ref)
with Timer(name='get', initial_text=True):
# takes around 40 seconds
output_lst = ray.get(output_ref)
for input_data, output_data in zip(data_list, output_lst):
for key in input_data.batch.keys():
assert torch.all(torch.eq(input_data.batch[key] + 1,
output_data.batch[key])), (input_data.batch[key], output_data.batch[key], key)
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import ray
import torch
from verl import DataProto
from tensordict import TensorDict
from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs
from verl.single_controller.ray import RayWorkerGroup
os.environ['RAY_DEDUP_LOGS'] = '0'
os.environ['NCCL_DEBUG'] = 'WARN'
@ray.remote
class ModelActor(Worker):
def __init__(self):
pass
class HackSelf():
def __init__(self):
pass
def get_aux_metrics(self, test_proto):
sequence_ids = test_proto.batch["sequence_ids"]
decode_count = []
for i in range(sequence_ids.size(0)):
decode_count.append(len(sequence_ids[i].tolist()))
ret_proto = DataProto(batch=TensorDict({
"sequence_ids": sequence_ids,
"decode_count": torch.tensor(decode_count)
},
batch_size=sequence_ids.size(0)))
return ret_proto
def test():
# construct model
ray.init()
# create 2 workers, each hold a GPU
resource_pool = RayResourcePool([2], use_gpu=True, name_prefix='a')
class_with_args = RayClassWithInitArgs(cls=ModelActor)
shard_wg = RayWorkerGroup(resource_pool, class_with_args)
test_bs = 8
test_proto = DataProto(TensorDict({
"sequence_ids": torch.ones([test_bs, 2048], dtype=torch.int64),
},
batch_size=test_bs),
meta_info={"query_length": 1536})
# Sharding among different ranks
ret_proto1 = shard_wg.execute_with_func_generator(get_aux_metrics, test_proto)
# compare execute on driver
hs = HackSelf()
ret_proto2 = get_aux_metrics(hs, test_proto)
torch.testing.assert_close(ret_proto1.batch["decode_count"], ret_proto2.batch["decode_count"])
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, merge_resource_pool
from verl.single_controller.base.worker import Worker
@ray.remote
class TestActor(Worker):
# TODO: passing *args and **kwargs is bug-prone and not very convincing
def __init__(self, cuda_visible_devices=None) -> None:
super().__init__(cuda_visible_devices)
def get_node_id(self):
return ray.get_runtime_context().get_node_id()
def test():
ray.init()
# test single-node-no-partition
print(f"test single-node-no-partition")
resource_pool = RayResourcePool([8], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor)
print("create actor worker group")
actor_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="high_level_api_actor")
print("create critic worker group")
critic_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="hight_level_api_critic")
print("create rm worker group")
rm_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="high_level_api_rm")
print("create ref worker group")
ref_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="high_level_api_ref")
assert actor_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert critic_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert rm_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert ref_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
del actor_wg
del critic_wg
del rm_wg
del ref_wg
[ray.util.remove_placement_group(pg) for pg in resource_pool.get_placement_groups()]
print("wait 5s to remove placemeng_group")
time.sleep(5)
# test single-node-multi-partition
print(f"test single-node-multi-partition")
rm_resource_pool = RayResourcePool([4], use_gpu=True, name_prefix="rm")
ref_resource_pool = RayResourcePool([4], use_gpu=True, name_prefix="ref")
total_resource_pool = merge_resource_pool(rm_resource_pool, ref_resource_pool)
assert rm_resource_pool.world_size == 4
assert ref_resource_pool.world_size == 4
assert total_resource_pool.world_size == 8
actor_wg = RayWorkerGroup(total_resource_pool, class_with_args, name_prefix="high_level_api_actor")
critic_wg = RayWorkerGroup(total_resource_pool, class_with_args, name_prefix="high_level_api_critic")
rm_wg = RayWorkerGroup(rm_resource_pool, class_with_args, name_prefix="high_level_api_rm")
ref_wg = RayWorkerGroup(ref_resource_pool, class_with_args, name_prefix="high_level_api_ref")
assert actor_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert critic_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert rm_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(4)]
assert ref_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(4, 8)]
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
e2e test verl.single_controller.ray
"""
import os
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
from verl.single_controller.base.decorator import register, Dispatch, collect_all_to_all, Execute
@ray.remote
class TestActor(Worker):
def __init__(self) -> None:
super().__init__()
def getenv(self, key):
val = os.getenv(key, f"{key} not set")
return val
def test_basics():
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor)
worker_group = RayWorkerGroup(resource_pool=resource_pool,
ray_cls_with_init=class_with_args,
name_prefix="worker_group_basic")
output = worker_group.execute_all_sync("getenv", key="RAY_LOCAL_WORLD_SIZE")
assert output == ["4", "4", "4", "4"]
output = worker_group.execute_all_sync("getenv", key="RAY_LOCAL_RANK")
assert set(output) == set(["0", "1", "2", "3"])
ray.shutdown()
if __name__ == '__main__':
test_basics()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
@ray.remote
class TestWorker:
def __init__(self, rank, world_size, group_name):
self.rank = rank
self.world_size = world_size
self.group_name = group_name
self.communicator = None
def init(self):
from verl.utils.rendezvous.ray_backend import create_nccl_communicator_in_ray
self.communicator = create_nccl_communicator_in_ray(self.rank, self.world_size, self.group_name)
def test(self):
if self.communicator is None:
return None
return self.communicator.rank_id()
def test_rvdz():
ray.init()
group_name = "test_group"
world_size = 2
workers = [TestWorker.options(num_gpus=1).remote(rank, world_size, group_name) for rank in range(world_size)]
ray.get([worker.init.remote() for worker in workers])
ranks = ray.get([worker.test.remote() for worker in workers])
assert ranks == [0, 1], f"expecting [0, 1], got {ranks}"
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
e2e test verl.single_controller.ray
"""
import torch
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
from verl.single_controller.base.decorator import register, Dispatch, collect_all_to_all, Execute
def two_to_all_dispatch_fn(worker_group, *args, **kwargs):
"""
Assume each input is a list of length 2. Duplicate the inputs in an interleaved fashion and pass them to every worker.
"""
for arg in args:
assert len(arg) == 2
for i in range(worker_group.world_size - 2):
arg.append(arg[i % 2])
for k, v in kwargs.items():
assert len(v) == 2
for i in range(worker_group.world_size - 2):
v.append(v[i % 2])
return args, kwargs
@ray.remote
class TestActor(Worker):
# TODO: passing *args and **kwargs is bug-prone and not very convincing
def __init__(self, x) -> None:
super().__init__()
self._x = x
def foo(self, y):
return self._x + y
@register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)
def foo_rank_zero(self, x, y):
return self._x + y + x
@register(Dispatch.ONE_TO_ALL, blocking=False)
def foo_one_to_all(self, x, y):
return self._x + y + x
@register(Dispatch.ALL_TO_ALL, blocking=False)
def foo_all_to_all(self, x, y):
return self._x + y + x
@register(dispatch_mode={'dispatch_fn': two_to_all_dispatch_fn, 'collect_fn': collect_all_to_all})
def foo_custom(self, x, y):
return self._x + y + x
@ray.remote(num_gpus=0.1)
def remote_call_wg(worker_names):
class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)
worker_group = RayWorkerGroup.from_detached(worker_names=worker_names, ray_cls_with_init=class_with_args)
print(worker_group.worker_names)
output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6])
assert output_ref == [8, 10, 8, 10]
output_ref = worker_group.foo_rank_zero(x=1, y=2)
assert output_ref == 5
return worker_group.worker_names
def add_one(data):
data = data.to("cuda")
data += 1
data = data.to("cpu")
return data
def test_basics():
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)
worker_group = RayWorkerGroup(resource_pool=resource_pool,
ray_cls_with_init=class_with_args,
name_prefix="worker_group_basic")
print(worker_group.worker_names)
# this will wait for all the results
output = worker_group.execute_all_sync("foo", y=3)
assert output == [5, 5, 5, 5]
# this is a list of object reference. It won't block.
output_ref = worker_group.execute_all_async("foo", y=4)
print(output_ref)
assert ray.get(output_ref) == [6, 6, 6, 6]
output_ref = worker_group.foo_one_to_all(x=1, y=2)
assert ray.get(output_ref) == [5, 5, 5, 5]
output_ref = worker_group.foo_all_to_all(x=[1, 2, 3, 4], y=[5, 6, 7, 8])
assert ray.get(output_ref) == [8, 10, 12, 14]
print(ray.get(remote_call_wg.remote(worker_group.worker_names)))
output = worker_group.execute_func_rank_zero(add_one, torch.ones(2, 2))
torch.testing.assert_close(output, torch.ones(2, 2) + 1)
ray.shutdown()
if __name__ == '__main__':
test_basics()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ['RAY_DEDUP_LOGS'] = '0'
os.environ['NCCL_DEBUG'] = 'WARN'
import torch
import torch.distributed
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
@ray.remote
class TestAllGatherActor(Worker):
def __init__(self, size) -> None:
super().__init__()
self.size = size
def init(self):
torch.distributed.init_process_group()
self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device='cuda')
self.tensor += self.rank
def all_gather(self):
world_size = self._world_size
output = torch.zeros(size=(self.tensor.shape[0] * world_size,),
dtype=self.tensor.dtype,
device=self.tensor.device)
torch.distributed.all_gather_into_tensor(output, self.tensor, async_op=False)
return output
@ray.remote
class TestAllGatherActorV2(Worker):
def __init__(self, size) -> None:
super().__init__()
self.size = size
torch.distributed.init_process_group()
self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device='cuda')
self.tensor += self.rank
def all_gather(self):
world_size = self._world_size
output = torch.zeros(size=(self.tensor.shape[0] * world_size,),
dtype=self.tensor.dtype,
device=self.tensor.device)
torch.distributed.all_gather_into_tensor(output, self.tensor, async_op=False)
return output
def test_all_gather_torch():
"""
In this test, we instantiate 4 workers, each holding one GPU, and test all_gather
"""
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestAllGatherActor, size=2)
worker_group = RayWorkerGroup(resource_pool, class_with_args, name_prefix="worker_group_torch")
worker_group.execute_all_sync('init')
output = worker_group.execute_all_sync('all_gather')
for i in range(1, len(output)):
assert torch.all(output[i] == output[0])
output = output[0].cpu()
print(output)
assert torch.all(output == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.int64))
ray.shutdown()
def test_all_gather_torch_v2():
"""
In this test, we instantiate 4 workers, each holding one GPU, and test all_gather
"""
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestAllGatherActorV2, size=2)
worker_group = RayWorkerGroup(resource_pool, class_with_args, name_prefix="worker_group_torch")
output = worker_group.execute_all_sync('all_gather')
for i in range(1, len(output)):
assert torch.all(output[i] == output[0])
output = output[0].cpu()
print(output)
assert torch.all(output == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.int64))
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision, CPUOffload
from torch.distributed.fsdp.api import ShardingStrategy, ShardedStateDictConfig, StateDictType
import torch
from verl.utils.distributed import initialize_global_process_group
from verl.third_party.vllm import LLM
from vllm import SamplingParams
import time
import torch.distributed as dist
def main():
assert torch.cuda.is_available(), 'CUDA must be available to run the FSDP vLLM example'
local_rank, rank, world_size = initialize_global_process_group()
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'Qwen/Qwen2-7B-Instruct'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
actor_model_config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=True)
with torch.device("cuda"):
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True)
actor_model.to(torch.bfloat16)
max_prompt_length = 16
response_length = 32
preencode_prompts = [
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
from verl.utils.torch_functional import pad_sequence_to_length
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True).cuda()
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True).cuda()
from transformers import GenerationConfig
generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=32,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]
print(f'hf response: {tokenizer.batch_decode(response)}')
tensor_model_parallel_size = 4
from torch.distributed.device_mesh import init_device_mesh
device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32)
fsdp_model = FSDP(actor_model,
use_orig_params=True,
auto_wrap_policy=None,
device_id=torch.cuda.current_device(),
sharding_strategy=ShardingStrategy.FULL_SHARD,
mixed_precision=mixed_precision,
cpu_offload=CPUOffload(offload_params=False),
sync_module_states=False,
device_mesh=device_mesh)
FSDP.set_state_dict_type(fsdp_model,
state_dict_type=StateDictType.SHARDED_STATE_DICT,
state_dict_config=ShardedStateDictConfig())
state_dict = fsdp_model.state_dict()
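# with SHARDED_STATE_DICT each rank holds DTensor shards; they are synced into vLLM below via load_format='dtensor'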
sampling_params = SamplingParams(temperature=0,
top_p=1,
n=1,
max_tokens=response_length,
logprobs=1,
ignore_eos=True,
detokenize=False)
print(actor_model_config)
llm = LLM(model=None,
tokenizer=tokenizer,
model_hf_config=actor_model_config,
tensor_parallel_size=tensor_model_parallel_size,
enforce_eager=True,
dtype='bfloat16',
load_format='dummy_dtensor',
gpu_memory_utilization=0.8,
trust_remote_code=True)
# Warmup iterations
for _ in range(10):
torch.cuda.synchronize()
llm.sync_model_weights(actor_weights=state_dict, load_format='dtensor')
torch.cuda.synchronize()
dist.barrier()
start_time = time.time()
llm.sync_model_weights(actor_weights=state_dict, load_format='dtensor')
torch.cuda.synchronize()
dist.barrier()
end_time = time.time()
# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.6f} seconds")
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
idx_list = []
batch_size = input_ids.shape[0]
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
from verl.workers.rollout.vllm_rollout.vllm_rollout import _pre_process_inputs
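# strip the left padding from each prompt so vLLM receives raw prompt token ids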
for i in range(batch_size):
idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i]))
print('start generation')
outputs = llm.generate(prompt_token_ids=idx_list, sampling_params=sampling_params, use_tqdm=False)
vllm_output = outputs[0].cuda()
if torch.distributed.get_rank() == 0:
print(f'hf response: {tokenizer.batch_decode(response)}')
print(f'vllm response: {tokenizer.batch_decode(vllm_output)}')
if __name__ == "__main__":
main()
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from torch.distributed.device_mesh import init_device_mesh
from sglang.srt.entrypoints.verl_engine import VerlEngine
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GenerationConfig
from verl.utils.torch_functional import pad_sequence_to_length
def levenshtein(s1, s2):
m, n = len(s1), len(s2)
# Initialize matrix of zeros
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize first column and first row of the matrix
for i in range(m + 1):
dp[i][0] = i # Deletion from s1 to empty string
for j in range(n + 1):
dp[0][j] = j # Insertion to s1 from empty string
# Compute the Levenshtein distance matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1 # No cost if characters match
dp[i][j] = min(
dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost # Substitution
)
return dp[m][n]
def are_lists_similar(a, b):
if len(a) != len(b):
print("The lists are of different lengths.")
return False
total_length = 0
total_diff = 0
for s1, s2 in zip(a, b):
max_len = max(len(s1), len(s2))
total_length += max_len
diff = levenshtein(s1, s2)
total_diff += diff
print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
percentage_difference = (total_diff / total_length) * 100
print(f"Total difference: {percentage_difference:.2f}%")
return percentage_difference <= 10
def initialize_global_process_group(timeout_second=36000):
from datetime import timedelta
import torch.distributed
# NOTE MODIFIED should provide backend=None to have nccl+gloo
# torch.distributed.init_process_group('nccl', timeout=timedelta(seconds=timeout_second))
torch.distributed.init_process_group(timeout=timedelta(seconds=timeout_second))
local_rank = int(os.environ["LOCAL_RANK"])
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
if torch.distributed.is_initialized():
torch.cuda.set_device(local_rank)
return local_rank, rank, world_size
def test_sglang_spmd():
assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'
initialize_global_process_group()
# fill rollout config
max_prompt_length = 16
max_response_length = 16
# Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'Qwen/Qwen2-7B-Instruct'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side='left')
preencode_prompts = [
"Who won the Champions League in 2019?",
"The founder of Apple is",
"What's your name",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
actor_model.to(torch.bfloat16)
sampling_params = dict(n=1,
temperature=0,
top_p=1,
top_k=-1,
max_new_tokens=max_response_length,
presence_penalty=0.0,
frequency_penalty=0.0,
repetition_penalty=1.0,
skip_special_tokens=True,
spaces_between_special_tokens=True,
ignore_eos=False)
tensor_parallel_size = 4
device_mesh_kwargs = dict(mesh_shape=(1, tensor_parallel_size, 1), mesh_dim_names=["dp", "tp", "pp"])
inference_device_mesh_cpu = init_device_mesh("cpu", **device_mesh_kwargs)
for k in ["TORCHELASTIC_USE_AGENT_STORE"]:
if k in os.environ:
del os.environ[k]
print('building sglang rollout engine')
llm = VerlEngine(model_path=local_model_path,
dtype="bfloat16",
mem_fraction_static=0.5,
device_mesh_cpu=inference_device_mesh_cpu["tp"],
base_gpu_id=0,
gpu_id_step=1)
llm.release_memory_occupation()
print("start generation")
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
batch_size = input_ids.size(0)
generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_response_length,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]
hf_response_tokens = tokenizer.batch_decode(response)
print(f"hf response: {hf_response_tokens}")
print(f"{sampling_params=}")
idx_list = []
batch_size = input_ids.shape[0]
pad_token_id = (tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id)
for i in range(batch_size):
idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i]))
outputs = llm.generate(input_ids=idx_list, sampling_params=sampling_params)
sglang_response_tokens = []
for output in outputs:
print(f"{output=}")
generated_text = output["text"]
sglang_response_tokens.append(generated_text)
print(f"sglang response: {sglang_response_tokens}")
assert are_lists_similar(hf_response_tokens, sglang_response_tokens), \
f"Strings differ more than 10%:\n"
print("Check Pass")
def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor):
# remove the left padding in the prompt token_id
# pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
token_ids = prompt_token_ids[non_pad_index:].tolist()
return token_ids
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import transformers
from verl.third_party.vllm import LLM, vllm_version
from verl.utils.model import update_model_config
from vllm import SamplingParams
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import GenerationConfig
from verl.utils.torch_functional import pad_sequence_to_length
from verl.workers.rollout.vllm_rollout.vllm_rollout import _pre_process_inputs
def levenshtein(s1, s2):
m, n = len(s1), len(s2)
# Initialize matrix of zeros
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize first column and first row of the matrix
for i in range(m + 1):
dp[i][0] = i # Deletion from s1 to empty string
for j in range(n + 1):
dp[0][j] = j # Insertion to s1 from empty string
# Compute the Levenshtein distance matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1 # No cost if characters match
dp[i][j] = min(
dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost # Substitution
)
return dp[m][n]
def are_lists_similar(a, b):
if len(a) != len(b):
print("The lists are of different lengths.")
return False
total_length = 0
total_diff = 0
for s1, s2 in zip(a, b):
max_len = max(len(s1), len(s2))
total_length += max_len
diff = levenshtein(s1, s2)
total_diff += diff
print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
percentage_difference = (total_diff / total_length) * 100
print(f"Total difference: {percentage_difference:.2f}%")
return percentage_difference <= 10
def test_vllm_with_hf():
assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'
# fill rollout config
max_prompt_length = 16
max_response_length = 16
# Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'deepseek-ai/deepseek-llm-7b-chat'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
preencode_prompts = [
"Who won the Champions League in 2019?",
"The founder of Apple is",
"What's your name",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
actor_model.to(torch.bfloat16)
actor_model_config = AutoConfig.from_pretrained(local_model_path)
temperature = 0
top_p = 1
kwargs = dict(n=1,
temperature=temperature,
top_p=top_p,
max_tokens=max_response_length,
logprobs=1,
ignore_eos=True)
if vllm_version in ('0.4.2', '0.5.4', '0.6.3'):
kwargs['detokenize'] = False
sampling_params = SamplingParams(**kwargs)
tensor_parallel_size = 4
llm = LLM(model=actor_model,
tokenizer=tokenizer,
model_hf_config=actor_model_config,
tensor_parallel_size=tensor_parallel_size,
dtype='bfloat16',
gpu_memory_utilization=0.1,
load_format='hf')
print('start generation')
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
batch_size = input_ids.size(0)
idx_list = []
# parse idx from torch.Tensor to List[List[int]]
for i in range(batch_size):
idx_list.append(_pre_process_inputs(tokenizer.pad_token_id, input_ids[i]))
outputs = llm.generate(prompt_token_ids=idx_list, sampling_params=sampling_params, use_tqdm=False)
vllm_output = outputs[0].cuda()
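# free the vLLM KV cache and drop the engine before running HF generation on the same GPUs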
llm.free_cache_engine()
llm = None
import gc
torch.cuda.empty_cache()
gc.collect()
generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_response_length,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]
hf_response_tokens = tokenizer.batch_decode(response)
vllm_response_tokens = tokenizer.batch_decode(vllm_output)
print(f'hf response: {hf_response_tokens}')
print(f'vllm response: {vllm_response_tokens}')
assert are_lists_similar(hf_response_tokens, vllm_response_tokens), \
f'Strings differ more than 10%:\n'
print('Check Pass')
# if __name__ == "__main__":
# test_vllm_with_hf()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import transformers
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision, CPUOffload
from torch.distributed.fsdp.api import ShardingStrategy, ShardedStateDictConfig, StateDictType
from vllm import LLM, SamplingParams
from verl.utils.model import update_model_config
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import GenerationConfig
from verl.utils.distributed import initialize_global_process_group
from verl.utils.torch_functional import pad_sequence_to_length
def levenshtein(s1, s2):
m, n = len(s1), len(s2)
# Initialize matrix of zeros
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize first column and first row of the matrix
for i in range(m + 1):
dp[i][0] = i # Deletion from s1 to empty string
for j in range(n + 1):
dp[0][j] = j # Insertion to s1 from empty string
# Compute the Levenshtein distance matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1 # No cost if characters match
dp[i][j] = min(
dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost # Substitution
)
return dp[m][n]
def are_lists_similar(a, b):
if len(a) != len(b):
print("The lists are of different lengths.")
return False
total_length = 0
total_diff = 0
for s1, s2 in zip(a, b):
max_len = max(len(s1), len(s2))
total_length += max_len
diff = levenshtein(s1, s2)
total_diff += diff
print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
percentage_difference = (total_diff / total_length) * 100
print(f"Total difference: {percentage_difference:.2f}%")
return percentage_difference <= 15
def test_vllm_spmd():
assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'
local_rank, rank, world_size = initialize_global_process_group()
# Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'Qwen/Qwen2-7B-Instruct'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side='left', trust_remote_code=True)
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True)
actor_model.to(torch.bfloat16)
# fill rollout config
max_prompt_length = 16
max_response_length = 32
preencode_prompts = [
"Who won the Champions League in 2019?",
"The founder of Apple is",
"What's your name",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
print('start generation')
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
temperature = 0
top_p = 1
kwargs = dict(n=1,
temperature=temperature,
top_p=top_p,
max_tokens=max_response_length,
logprobs=1,
ignore_eos=True)
tensor_parallel_size = 4
from torch.distributed.device_mesh import init_device_mesh
device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32)
fsdp_model = FSDP(actor_model,
use_orig_params=True,
auto_wrap_policy=None,
device_id=torch.cuda.current_device(),
sharding_strategy=ShardingStrategy.FULL_SHARD,
mixed_precision=mixed_precision,
cpu_offload=CPUOffload(offload_params=False),
sync_module_states=False,
device_mesh=device_mesh)
FSDP.set_state_dict_type(fsdp_model,
state_dict_type=StateDictType.SHARDED_STATE_DICT,
state_dict_config=ShardedStateDictConfig())
state_dict = fsdp_model.state_dict()
sampling_params = SamplingParams(**kwargs)
llm = LLM(
model=local_model_path,
enable_sleep_mode=True,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend="external_launcher",
dtype='bfloat16',
enforce_eager=True,
gpu_memory_utilization=0.8,
disable_custom_all_reduce=True,
disable_mm_preprocessor_cache=True,
skip_tokenizer_init=False,
enable_prefix_caching=True,
trust_remote_code=True,
seed=1,
)
outputs = llm.generate(preencode_prompts, sampling_params=sampling_params, use_tqdm=False)
vllm_response_tokens = []
for output in outputs:
generated_text = output.outputs[0].text
vllm_response_tokens.append(generated_text)
world_size = torch.distributed.get_world_size()
model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
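# gather each FSDP shard into a full tensor before loading the weights into the in-process vLLM model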
model.load_weights(
((name, param.full_tensor() if world_size != 1 else param) for name, param in state_dict.items()))
outputs = llm.generate(preencode_prompts, sampling_params=sampling_params, use_tqdm=False)
verl_vllm_response_tokens = []
for output in outputs:
generated_text = output.outputs[0].text
verl_vllm_response_tokens.append(generated_text)
if torch.distributed.get_rank() == 0:
print(f'vllm response: {vllm_response_tokens}')
print(f'verl-vllm response: {verl_vllm_response_tokens}')
assert are_lists_similar(vllm_response_tokens, verl_vllm_response_tokens), \
'Strings differ more than 15%:\n'
print('Check Pass')
torch.distributed.destroy_process_group()
if __name__ == "__main__":
test_vllm_spmd()
# Copyright 2024 PRIME team and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from verl.utils.reward_score import _default_compute_score
from verl.utils.reward_score.prime_code import apps_check_correctness
import asyncio
from verl.workers.reward_manager.prime import parallel_compute_score_async
prime_math_answers = [
"""\\begin{bmatrix}\n -7 & 6 & -8 \\\\\n 11 & -9 & 12 \\\\\n 15 & -16 & 19 \n \\end{bmatrix}""",
"""\\frac{\\sqrt{505}}{7}""", """x^2 + y^2 + 4x - 6y + 13"""
]
prime_math_gts = [
"""\\begin{pmatrix}\n -7 & 6 & -8 \\\\\n 11 & -9 & 12 \\\\\n 15 & -16 & 19\n \\end{pmatrix}""", # mat test
"""\\frac{\\sqrt{505}}{7}""", # frac test
"""(x + 2)^2 + (y - 3)^2 """ # symbolic test
]
prime_code_answers = [
"""import sys
from collections import deque
def main():
data = sys.stdin.read().split()
it = iter(data)
# Read start and target positions
x0, y0, x1, y1 = int(next(it)), int(next(it)), int(next(it)), int(next(it))
n = int(next(it))
allowed = set()
# The total number of allowed cells is at most 10^5.
for _ in range(n):
r = int(next(it))
a = int(next(it))
b = int(next(it))
for c in range(a, b + 1):
allowed.add((r, c))
# Directions for the king (8 neighboring cells)
directions = [(-1, -1), (-1, 0), (-1, 1),
(0, -1), (0, 1),
(1, -1), (1, 0), (1, 1)]
start = (x0, y0)
target = (x1, y1)
# BFS initialization
queue = deque()
queue.append((x0, y0, 0))
# Mark the starting cell as visited by removing it from allowed set.
allowed.discard(start)
while queue:
x, y, moves = queue.popleft()
if (x, y) == target:
print(moves)
return
for dx, dy in directions:
nx, ny = x + dx, y + dy
if (nx, ny) in allowed:
allowed.remove((nx, ny))
queue.append((nx, ny, moves + 1))
print(-1)
if __name__ == '__main__':
main()
"""
] * 2
prime_code_gts = [
"""{\n \"inputs\": [\n \"5 7 6 11\\n3\\n5 3 8\\n6 7 11\\n5 2 5\\n\",\n \"3 4 3 10\\n3\\n3 1 4\\n4 5 9\\n3 10 10\\n\",\n \"1 1 2 10\\n2\\n1 1 3\\n2 6 10\\n\",\n \"9 8 7 8\\n9\\n10 6 6\\n10 6 6\\n7 7 8\\n9 5 6\\n8 9 9\\n9 5 5\\n9 8 8\\n8 5 6\\n9 10 10\\n\",\n \"6 15 7 15\\n9\\n6 15 15\\n7 14 14\\n6 15 15\\n9 14 14\\n7 14 16\\n6 15 15\\n6 15 15\\n7 14 14\\n8 15 15\\n\",\n \"13 16 20 10\\n18\\n13 16 16\\n20 10 10\\n19 10 10\\n12 15 15\\n20 10 10\\n18 11 11\\n19 10 10\\n19 10 10\\n20 10 10\\n19 10 10\\n20 10 10\\n20 10 10\\n19 10 10\\n18 11 11\\n13 16 16\\n12 15 15\\n19 10 10\\n19 10 10\\n\",\n \"89 29 88 30\\n16\\n87 31 31\\n14 95 95\\n98 88 89\\n96 88 88\\n14 97 97\\n13 97 98\\n100 88 88\\n88 32 32\\n99 88 89\\n90 29 29\\n87 31 31\\n15 94 96\\n89 29 29\\n88 32 32\\n97 89 89\\n88 29 30\\n\",\n \"30 14 39 19\\n31\\n35 7 11\\n37 11 12\\n32 13 13\\n37 5 6\\n46 13 13\\n37 14 14\\n31 13 13\\n43 13 19\\n45 15 19\\n46 13 13\\n32 17 17\\n41 14 19\\n30 14 14\\n43 13 17\\n34 16 18\\n44 11 19\\n38 13 13\\n40 12 20\\n37 16 18\\n46 16 18\\n34 10 14\\n36 9 10\\n36 15 19\\n38 15 19\\n42 13 19\\n33 14 15\\n35 15 19\\n33 17 18\\n39 12 20\\n36 5 7\\n45 12 12\\n\",\n \"2 1 1 1\\n2\\n1 1 2\\n2 1 2\\n\",\n \"1 1 1 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\",\n \"1 1 1000000000 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\"\n ],\n \"outputs\": [\n \"4\\n\",\n \"6\\n\",\n \"-1\\n\",\n \"2\\n\",\n \"1\\n\",\n \"-1\\n\",\n \"1\\n\",\n \"9\\n\",\n \"1\\n\",\n \"1\\n\",\n \"-1\\n\"\n ]\n}""", # A correct sample
"""{\n \"inputs\": [\n \"5 7 6 11\\n3\\n5 3 8\\n6 7 11\\n5 2 5\\n\",\n \"3 4 3 10\\n3\\n3 1 4\\n4 5 9\\n3 10 10\\n\",\n \"1 1 2 10\\n2\\n1 1 3\\n2 6 10\\n\",\n \"9 8 7 8\\n9\\n10 6 6\\n10 6 6\\n7 7 8\\n9 5 6\\n8 9 9\\n9 5 5\\n9 8 8\\n8 5 6\\n9 10 10\\n\",\n \"6 15 7 15\\n9\\n6 15 15\\n7 14 14\\n6 15 15\\n9 14 14\\n7 14 16\\n6 15 15\\n6 15 15\\n7 14 14\\n8 15 15\\n\",\n \"13 16 20 10\\n18\\n13 16 16\\n20 10 10\\n19 10 10\\n12 15 15\\n20 10 10\\n18 11 11\\n19 10 10\\n19 10 10\\n20 10 10\\n19 10 10\\n20 10 10\\n20 10 10\\n19 10 10\\n18 11 11\\n13 16 16\\n12 15 15\\n19 10 10\\n19 10 10\\n\",\n \"89 29 88 30\\n16\\n87 31 31\\n14 95 95\\n98 88 89\\n96 88 88\\n14 97 97\\n13 97 98\\n100 88 88\\n88 32 32\\n99 88 89\\n90 29 29\\n87 31 31\\n15 94 96\\n89 29 29\\n88 32 32\\n97 89 89\\n88 29 30\\n\",\n \"30 14 39 19\\n31\\n35 7 11\\n37 11 12\\n32 13 13\\n37 5 6\\n46 13 13\\n37 14 14\\n31 13 13\\n43 13 19\\n45 15 19\\n46 13 13\\n32 17 17\\n41 14 19\\n30 14 14\\n43 13 17\\n34 16 18\\n44 11 19\\n38 13 13\\n40 12 20\\n37 16 18\\n46 16 18\\n34 10 14\\n36 9 10\\n36 15 19\\n38 15 19\\n42 13 19\\n33 14 15\\n35 15 19\\n33 17 18\\n39 12 20\\n36 5 7\\n45 12 12\\n\",\n \"2 1 1 1\\n2\\n1 1 2\\n2 1 2\\n\",\n \"1 1 1 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\",\n \"1 1 1000000000 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\"\n ],\n \"outputs\": [\n \"4\\n\",\n \"6\\n\",\n \"-1\\n\",\n \"-1\\n\",\n \"1\\n\",\n \"-1\\n\",\n \"1\\n\",\n \"9\\n\",\n \"1\\n\",\n \"1\\n\",\n \"-1\\n\"\n ]\n}"""
] # A failed sample with first several in-out passed
prime_code_scores = [1.0, 0.9]
def test_parallelism():
"""
Test if process pool works properly
"""
sequences_str = []
ground_truth = []
data_sources = []
while len(sequences_str) < 32:
sequences_str.extend(prime_code_answers)
ground_truth.extend(prime_code_gts)
data_sources.extend(['codecontests'] * len(prime_code_answers))
sequences_str.extend(prime_math_answers)
ground_truth.extend(prime_math_gts)
data_sources.extend(['numina_aops_forum'] * len(prime_math_answers))
scores = asyncio.run(
parallel_compute_score_async(_default_compute_score,
sequences_str,
ground_truth,
data_sources,
num_processes=16))
print(scores)
def test_prime_code():
"""
Test PRIME code sandbox.
"""
data_source = 'codecontests'
for completion, ground_truth, score_ in zip(prime_code_answers, prime_code_gts, prime_code_scores):
score = _default_compute_score(data_source, completion, ground_truth)
assert float(score) == score_
def test_check_correctness():
completion = prime_code_answers[0]
ground_truth = json.loads(prime_code_gts[0])
ground_truth_single = {'inputs': ground_truth['inputs'][:1], 'outputs': ground_truth['outputs'][:1]}
res, meta = apps_check_correctness(in_outs=ground_truth_single, generation=completion, timeout=5, debug=False)
print(res, meta)
def test_prime_math():
data_source = 'numina_aops_forum'
for completion, ground_truth in zip(prime_math_answers, prime_math_gts):
score = _default_compute_score(data_source, completion, ground_truth)
assert float(score) == 1.0
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates"
license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates"
# Add custom license headers below
license_head_prime = "Copyright 2024 PRIME team and/or its affiliates"
license_head_individual = "Copyright 2025 Individual Contributor:"
license_headers = [license_head_bytedance, license_head_bytedance_25, license_head_prime, license_head_individual]
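# Usage (from the repo root): python3 <this script> --directory <path>
# The loop below asserts that every .py file under the directory contains one of the headers above.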
from pathlib import Path
from argparse import ArgumentParser
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--directory', '-d', required=True, type=str)
args = parser.parse_args()
directory_in_str = args.directory
pathlist = Path(directory_in_str).glob('**/*.py')
for path in pathlist:
# path is a Path object, not a string
path_in_str = str(path.absolute())
print(path_in_str)
with open(path_in_str, 'r', encoding='utf-8') as f:
file_content = f.read()
has_license = False
for lh in license_headers:
if lh in file_content:
has_license = True
break
assert has_license, f'file {path_in_str} does not contain license'
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def test_import():
import verl
print(verl.__version__)
def test_single_controller_import():
import verl.single_controller
print(verl.single_controller.__version__)