"test/vscode:/vscode.git/clone" did not exist on "590f2da052058dfa902b291d1b2be5d27f8f231b"
Commit f87b35b2 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2648 failed with stages
in 0 seconds
# Detached Worker
## How to run (Only on a single node)
- Start a local ray cluster:
```bash
ray start --head --port=6379
```
- Run the server
```bash
python3 server.py
```
- In another terminal, run the client
```bash
python3 client.py
```
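- Or run the whole flow in one go (the same commands as the shell script included in this commit):
```bash
ray start --head --port=6379
python3 server.py
python3 client.py
ray stop --force
```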
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In the client, we obtain handles to the detached server workers and send RPC requests.
"""
import ray
import torch
from verl import DataProto
from verl.single_controller.ray import RayClassWithInitArgs
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
from tensordict import TensorDict
from server import Trainer
def compute_position_id_with_mask(mask):
return torch.clip(torch.cumsum(mask, dim=-1) - 1, min=0, max=None)
if __name__ == '__main__':
ray.init(address='auto', namespace='verl')
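# connect to the running cluster; the namespace must match the one used by server.py so the detached workers can be found by name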
# get the worker group using names
worker_names = ['trainerTrainer_0:0', 'trainerTrainer_0:1']
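# server.py prints these worker names after it creates the detached worker group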
cls_with_init_args = RayClassWithInitArgs(cls=Trainer)
worker_group = NVMegatronRayWorkerGroup.from_detached(worker_names=worker_names,
ray_cls_with_init=cls_with_init_args)
batch_size = 16
sequence_length = 1024
# give Trainer some data to train
input_ids = torch.randint(low=0, high=256, size=(batch_size, sequence_length), dtype=torch.int64, device='cuda')
attention_mask = torch.ones_like(input_ids)
position_ids = compute_position_id_with_mask(attention_mask)
data = DataProto(batch=TensorDict(
{
'input_ids': input_ids,
'attention_mask': attention_mask,
'position_ids': position_ids
}, batch_size=batch_size),
meta_info={})
output = worker_group.train_model(data)
print(output)
#!/bin/bash
ray start --head --port=6379
python3 server.py
python3 client.py
ray stop --force
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Server starts a Trainer. Client sends data to the server to train.
"""
import os
os.environ['MEGATRON_USE_CUDA_TIMER'] = '0'
os.environ['MEGATRON_START_PROCESS_TIMER'] = 'False'
os.environ['NCCL_DEBUG'] = 'WARN'
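# Megatron's timers and NCCL logging are configured through these environment variables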
import torch
from torch import nn
import ray
from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
from verl.single_controller.base.megatron.worker import MegatronWorker
from verl.single_controller.base.decorator import register, Dispatch
from verl import DataProto
from verl.models.llama.megatron import ParallelLlamaForCausalLMRmPadPP
from megatron.core import parallel_state as mpu
from megatron.core.models.gpt.gpt_model import ModelType
from megatron.core import tensor_parallel
from verl.utils.megatron_utils import get_model, init_megatron_optim_config, mcore_model_parallel_config
from verl.utils.megatron.optimizer import get_megatron_optimizer
from transformers import LlamaConfig
from omegaconf import OmegaConf
from tensordict import TensorDict
@ray.remote
class Trainer(MegatronWorker):
def __init__(self):
super().__init__()
if not torch.distributed.is_initialized():
rank = int(os.environ['LOCAL_RANK'])
torch.distributed.init_process_group(backend="nccl")
torch.cuda.set_device(rank)
os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
mpu.initialize_model_parallel(
tensor_model_parallel_size=2,
pipeline_model_parallel_size=1,
virtual_pipeline_model_parallel_size=None,
pipeline_model_parallel_split_rank=None,
use_sharp=False,
context_parallel_size=1,
expert_model_parallel_size=1,
nccl_communicator_config_path=None,
)
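# seed the model-parallel CUDA RNG tracker (Megatron keeps separate random state per tensor-parallel rank)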
tensor_parallel.model_parallel_cuda_manual_seed(10)
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def init_model(self):
actor_model_config = LlamaConfig(vocab_size=256,
hidden_size=2048,
intermediate_size=5504,
num_hidden_layers=24,
num_attention_heads=16,
num_key_value_heads=16)
megatron_config = mcore_model_parallel_config(sequence_parallel=True, params_dtype=torch.bfloat16)
self.megatron_config = megatron_config
def megatron_actor_model_provider(pre_process, post_process):
# vpp is not supported yet because it hangs for an unknown reason; needs debugging
vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
# this_megatron_config = copy.deepcopy(megatron_config)
# this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
parallel_model = ParallelLlamaForCausalLMRmPadPP(config=actor_model_config,
megatron_config=megatron_config,
pre_process=pre_process,
post_process=post_process)
parallel_model.cuda()
return parallel_model
actor_module = get_model(model_provider_func=megatron_actor_model_provider,
model_type=ModelType.encoder_or_decoder,
wrap_with_ddp=True)
actor_module = nn.ModuleList(actor_module)
optim_config = OmegaConf.create({'lr': 1e-6, 'clip_grad': 1.0})
optim_config = init_megatron_optim_config(optim_config)
self.optimizer_config = optim_config
actor_optimizer = get_megatron_optimizer(model=actor_module, config=optim_config)
self.model = actor_module[0]
self.optimizer = actor_optimizer
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def train_model(self, data: DataProto) -> DataProto:
input_ids = data.batch['input_ids']
attention_mask = data.batch['attention_mask']
position_ids = data.batch['position_ids']
self.optimizer.zero_grad()
self.model.zero_grad_buffer(
zero_buffer=(not self.optimizer_config.use_distributed_optimizer
)) # use use_contiguous_buffers_in_local_ddp and no overlap_dp_param_comm
# update for 1 iteration
output = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids).logits
output.mean().backward()
update_successful, grad_norm, num_zeros_in_grad = self.optimizer.step(self.megatron_config,
self.megatron_config.timers)
return DataProto(batch=TensorDict({'loss': output.detach()}, batch_size=output.shape[0]))
if __name__ == '__main__':
ray.init(address='auto', namespace='verl')
resource_pool = RayResourcePool(process_on_nodes=[2], detached=True)
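# 2 workers on one node; detached=True keeps the actors alive after this driver exits so client.py can attach later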
cls_with_init_args = RayClassWithInitArgs(cls=Trainer)
worker_group = NVMegatronRayWorkerGroup(
resource_pool=resource_pool,
ray_cls_with_init=cls_with_init_args,
name_prefix='trainer',
detached=True,
)
worker_group.init_model()
worker_names = worker_group.worker_names
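# print the worker names; client.py passes them to from_detached() to re-attach to these workers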
print(worker_names)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import os
import subprocess
def test():
wait_time = 10
my_env = os.environ.copy()
my_env["WAIT_TIME"] = str(wait_time)
p = subprocess.Popen(["python3", "-u", "./check_worker_alive/main.py"], env=my_env, stdout=subprocess.PIPE)
count = 0
while b"foo started" not in p.stdout.read():
time.sleep(1)
count += 1
if count > 40:
raise RuntimeError("timeout for start foo in check_worker_alive/main.py")
print(
time.time(),
f"wait 1.5 wait time {wait_time*1.5} to let signal returned to process but still not exceed process wait time")
time.sleep(wait_time * 1.5)
print(time.time(), f"start checking")
assert p.poll() is not None, f"process {p} still alive, expecting signal raised abort"
assert p.returncode != 0, f"process {p} exit with code 0, expecting not-zero exit code"
print(f"test passed")
if __name__ == "__main__":
test()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
from verl.single_controller.base import Worker
from verl.single_controller.base.decorator import register, Dispatch
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, create_colocated_worker_cls
from verl import DataProto
@ray.remote
class Actor(Worker):
def __init__(self) -> None:
super().__init__()
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def add(self, data: DataProto):
data.batch['a'] += self.rank
return data
@ray.remote
class Critic(Worker):
def __init__(self, config) -> None:
super().__init__()
self.config = config
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def sub(self, data: DataProto):
data.batch['a'] -= self.config['b']
return data
def test_colocated_workers():
ray.init()
import torch
data = DataProto.from_dict({'a': torch.zeros(10)})
# create separate workers on the same resource pool
actor_cls = RayClassWithInitArgs(cls=Actor)
critic_cls = RayClassWithInitArgs(cls=Critic, config={'b': 10})
resource_pool = RayResourcePool(process_on_nodes=[2])
actor_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=actor_cls)
critic_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=critic_cls)
expected_actor_output = actor_wg.add(data)
expected_critic_output = critic_wg.sub(data)
# create colocated workers
cls_dict = {'actor': actor_cls, 'critic': critic_cls}
ray_cls_with_init = create_colocated_worker_cls(cls_dict)
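# the colocated class packs Actor and Critic into a single worker per process; spawn() below splits it back into
# per-role worker groups that share the same underlying workers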
wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
spawn_wg = wg_dict.spawn(prefix_set=cls_dict.keys())
colocated_actor_wg = spawn_wg['actor']
colocated_critic_wg = spawn_wg['critic']
actor_output = colocated_actor_wg.add(data)
critic_output = colocated_critic_wg.sub(data)
torch.testing.assert_close(expected_actor_output.batch, actor_output.batch, atol=0, rtol=0)
torch.testing.assert_close(expected_critic_output.batch, critic_output.batch, atol=0, rtol=0)
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In this test, we instantiate a data-parallel worker group across 8 GPUs
"""
from verl.single_controller.base import Worker
from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
from verl.single_controller.base.decorator import Dispatch, register
import ray
import torch
from torch import distributed as dist
from verl import DataProto
from verl.utils.ray_utils import parallel_put
from codetiming import Timer
import tensordict
@ray.remote
class DummyWorker(Worker):
def __init__(self):
super().__init__()
dist.init_process_group()
@register(dispatch_mode=Dispatch.DP_COMPUTE, blocking=False)
def do_nothing(self, data):
for key in data.batch.keys():
data.batch[key] += 1
if tensordict.__version__ >= '0.5.0':
data.batch = data.batch.consolidate()
return data
def test_data_transfer():
ray.init()
# construct resource pool
resource_pool = RayResourcePool([8])
cls_with_init = RayClassWithInitArgs(cls=DummyWorker)
# construct worker group
wg = RayWorkerGroup(resource_pool, cls_with_init)
# this is real dataset size
batch_size = 4096
seqlen = 32768
data_dict = {}
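# two int64 tensors of shape (4096, 32768): roughly 1 GiB each, ~2 GiB payload in total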
for i in range(2):
data_dict[str(i)] = torch.randint(0, 10000, (batch_size, seqlen))
data = DataProto.from_dict(tensors=data_dict)
print(data)
# we manually split data here and send to each worker
data_list = data.chunk(wg.world_size)
for i in range(wg.world_size):
# consolidate is necessary: it packs the TensorDict into one contiguous storage so it serializes efficiently
if tensordict.__version__ >= '0.5.0':
data_list[i].batch = data_list[i].batch.consolidate()
with Timer(name='ray.pickle', initial_text=True):
for i in range(wg.world_size):
ray.cloudpickle.pickle.dumps(data_list[i])
with Timer(name='raw.pickle', initial_text=True):
import pickle
for i in range(wg.world_size):
pickle.dumps(data_list[i])
# we put in advance
with Timer(name='put', initial_text=True):
# takes around 40 seconds
data_list_ref = parallel_put(data_list)
# for i in range(wg.world_size):
# data_list[i] = ray.put(data_list[i])
with Timer(name='launch', initial_text=True):
output_ref = wg.do_nothing(data_list_ref)
with Timer(name='get', initial_text=True):
# takes around 40 seconds
output_lst = ray.get(output_ref)
for input_data, output_data in zip(data_list, output_lst):
for key in input_data.batch.keys():
assert torch.all(torch.eq(input_data.batch[key] + 1,
output_data.batch[key])), (input_data.batch[key], output_data.batch[key], key)
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import ray
import torch
from verl import DataProto
from tensordict import TensorDict
from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs
from verl.single_controller.ray import RayWorkerGroup
os.environ['RAY_DEDUP_LOGS'] = '0'
os.environ['NCCL_DEBUG'] = 'WARN'
@ray.remote
class ModelActor(Worker):
def __init__(self):
pass
class HackSelf():
def __init__(self):
pass
def get_aux_metrics(self, test_proto):
sequence_ids = test_proto.batch["sequence_ids"]
decode_count = []
for i in range(sequence_ids.size(0)):
decode_count.append(len(sequence_ids[i].tolist()))
ret_proto = DataProto(batch=TensorDict({
"sequence_ids": sequence_ids,
"decode_count": torch.tensor(decode_count)
},
batch_size=sequence_ids.size(0)))
return ret_proto
def test():
# construct model
ray.init()
# create 2 workers, each hold a GPU
resource_pool = RayResourcePool([2], use_gpu=True, name_prefix='a')
class_with_args = RayClassWithInitArgs(cls=ModelActor)
shard_wg = RayWorkerGroup(resource_pool, class_with_args)
test_bs = 8
test_proto = DataProto(TensorDict({
"sequence_ids": torch.ones([test_bs, 2048], dtype=torch.int64),
},
batch_size=test_bs),
meta_info={"query_length": 1536})
# Sharding among different ranks
ret_proto1 = shard_wg.execute_with_func_generator(get_aux_metrics, test_proto)
# compare execute on driver
hs = HackSelf()
ret_proto2 = get_aux_metrics(hs, test_proto)
torch.testing.assert_close(ret_proto1.batch["decode_count"], ret_proto2.batch["decode_count"])
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, merge_resource_pool
from verl.single_controller.base.worker import Worker
@ray.remote
class TestActor(Worker):
# TODO: passing *args and **kwargs is bug-prone and not very convincing
def __init__(self, cuda_visible_devices=None) -> None:
super().__init__(cuda_visible_devices)
def get_node_id(self):
return ray.get_runtime_context().get_node_id()
def test():
ray.init()
# test single-node-no-partition
print(f"test single-node-no-partition")
resource_pool = RayResourcePool([8], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor)
print("create actor worker group")
actor_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="high_level_api_actor")
print("create critic worker group")
critic_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="hight_level_api_critic")
print("create rm worker group")
rm_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="high_level_api_rm")
print("create ref worker group")
ref_wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="high_level_api_ref")
assert actor_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert critic_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert rm_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert ref_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
del actor_wg
del critic_wg
del rm_wg
del ref_wg
[ray.util.remove_placement_group(pg) for pg in resource_pool.get_placement_groups()]
print("wait 5s to remove placemeng_group")
time.sleep(5)
# test single-node-multi-partition
print(f"test single-node-multi-partition")
rm_resource_pool = RayResourcePool([4], use_gpu=True, name_prefix="rm")
ref_resource_pool = RayResourcePool([4], use_gpu=True, name_prefix="ref")
total_resource_pool = merge_resource_pool(rm_resource_pool, ref_resource_pool)
assert rm_resource_pool.world_size == 4
assert ref_resource_pool.world_size == 4
assert total_resource_pool.world_size == 8
actor_wg = RayWorkerGroup(total_resource_pool, class_with_args, name_prefix="high_level_api_actor")
critic_wg = RayWorkerGroup(total_resource_pool, class_with_args, name_prefix="high_level_api_critic")
rm_wg = RayWorkerGroup(rm_resource_pool, class_with_args, name_prefix="high_level_api_rm")
ref_wg = RayWorkerGroup(ref_resource_pool, class_with_args, name_prefix="high_level_api_ref")
assert actor_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert critic_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
assert rm_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(4)]
assert ref_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(4, 8)]
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
e2e test verl.single_controller.ray
"""
import os
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
from verl.single_controller.base.decorator import register, Dispatch, collect_all_to_all, Execute
@ray.remote
class TestActor(Worker):
def __init__(self) -> None:
super().__init__()
def getenv(self, key):
val = os.getenv(key, f"{key} not set")
return val
def test_basics():
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor)
worker_group = RayWorkerGroup(resource_pool=resource_pool,
ray_cls_with_init=class_with_args,
name_prefix="worker_group_basic")
output = worker_group.execute_all_sync("getenv", key="RAY_LOCAL_WORLD_SIZE")
assert output == ["4", "4", "4", "4"]
output = worker_group.execute_all_sync("getenv", key="RAY_LOCAL_RANK")
assert set(output) == set(["0", "1", "2", "3"])
ray.shutdown()
if __name__ == '__main__':
test_basics()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
@ray.remote
class TestWorker:
def __init__(self, rank, world_size, group_name):
self.rank = rank
self.world_size = world_size
self.group_name = group_name
self.communicator = None
def init(self):
from verl.utils.rendezvous.ray_backend import create_nccl_communicator_in_ray
self.communicator = create_nccl_communicator_in_ray(self.rank, self.world_size, self.group_name)
def test(self):
if self.communicator is None:
return None
return self.communicator.rank_id()
def test_rvdz():
ray.init()
group_name = "test_group"
world_size = 2
workers = [TestWorker.options(num_gpus=1).remote(rank, world_size, group_name) for rank in range(world_size)]
ray.get([worker.init.remote() for worker in workers])
ranks = ray.get([worker.test.remote() for worker in workers])
assert ranks == [0, 1], f"expecting [0, 1], got {ranks}"
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
e2e test verl.single_controller.ray
"""
import torch
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
from verl.single_controller.base.decorator import register, Dispatch, collect_all_to_all, Execute
def two_to_all_dispatch_fn(worker_group, *args, **kwargs):
"""
Assume each input is a list of length 2. Duplicate the inputs in an interleaved fashion and pass them to every worker.
"""
for arg in args:
assert len(arg) == 2
for i in range(worker_group.world_size - 2):
arg.append(arg[i % 2])
for k, v in kwargs.items():
assert len(v) == 2
for i in range(worker_group.world_size - 2):
v.append(v[i % 2])
return args, kwargs
@ray.remote
class TestActor(Worker):
# TODO: passing *args and **kwargs is bug-prone and not very convincing
def __init__(self, x) -> None:
super().__init__()
self._x = x
def foo(self, y):
return self._x + y
@register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)
def foo_rank_zero(self, x, y):
return self._x + y + x
@register(Dispatch.ONE_TO_ALL, blocking=False)
def foo_one_to_all(self, x, y):
return self._x + y + x
@register(Dispatch.ALL_TO_ALL, blocking=False)
def foo_all_to_all(self, x, y):
return self._x + y + x
@register(dispatch_mode={'dispatch_fn': two_to_all_dispatch_fn, 'collect_fn': collect_all_to_all})
def foo_custom(self, x, y):
return self._x + y + x
@ray.remote(num_gpus=0.1)
def remote_call_wg(worker_names):
class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)
worker_group = RayWorkerGroup.from_detached(worker_names=worker_names, ray_cls_with_init=class_with_args)
print(worker_group.worker_names)
output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6])
assert output_ref == [8, 10, 8, 10]
output_ref = worker_group.foo_rank_zero(x=1, y=2)
assert output_ref == 5
return worker_group.worker_names
def add_one(data):
data = data.to("cuda")
data += 1
data = data.to("cpu")
return data
def test_basics():
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)
worker_group = RayWorkerGroup(resource_pool=resource_pool,
ray_cls_with_init=class_with_args,
name_prefix="worker_group_basic")
print(worker_group.worker_names)
# this will wait for all the results
output = worker_group.execute_all_sync("foo", y=3)
assert output == [5, 5, 5, 5]
# this is a list of object reference. It won't block.
output_ref = worker_group.execute_all_async("foo", y=4)
print(output_ref)
assert ray.get(output_ref) == [6, 6, 6, 6]
output_ref = worker_group.foo_one_to_all(x=1, y=2)
assert ray.get(output_ref) == [5, 5, 5, 5]
output_ref = worker_group.foo_all_to_all(x=[1, 2, 3, 4], y=[5, 6, 7, 8])
assert ray.get(output_ref) == [8, 10, 12, 14]
print(ray.get(remote_call_wg.remote(worker_group.worker_names)))
output = worker_group.execute_func_rank_zero(add_one, torch.ones(2, 2))
torch.testing.assert_close(output, torch.ones(2, 2) + 1)
ray.shutdown()
if __name__ == '__main__':
test_basics()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ['RAY_DEDUP_LOGS'] = '0'
os.environ['NCCL_DEBUG'] = 'WARN'
import torch
import torch.distributed
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
@ray.remote
class TestAllGatherActor(Worker):
def __init__(self, size) -> None:
super().__init__()
self.size = size
def init(self):
torch.distributed.init_process_group()
self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device='cuda')
self.tensor += self.rank
def all_gather(self):
world_size = self._world_size
output = torch.zeros(size=(self.tensor.shape[0] * world_size,),
dtype=self.tensor.dtype,
device=self.tensor.device)
torch.distributed.all_gather_into_tensor(output, self.tensor, async_op=False)
return output
@ray.remote
class TestAllGatherActorV2(Worker):
def __init__(self, size) -> None:
super().__init__()
self.size = size
torch.distributed.init_process_group()
self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device='cuda')
self.tensor += self.rank
def all_gather(self):
world_size = self._world_size
output = torch.zeros(size=(self.tensor.shape[0] * world_size,),
dtype=self.tensor.dtype,
device=self.tensor.device)
torch.distributed.all_gather_into_tensor(output, self.tensor, async_op=False)
return output
def test_all_gather_torch():
"""
In this test, we instantiate 4 workers, each holding one GPU, and test all_gather
"""
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestAllGatherActor, size=2)
worker_group = RayWorkerGroup(resource_pool, class_with_args, name_prefix="worker_group_torch")
worker_group.execute_all_sync('init')
output = worker_group.execute_all_sync('all_gather')
for i in range(1, len(output)):
assert torch.all(output[i] == output[0])
output = output[0].cpu()
print(output)
assert torch.all(output == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.int64))
ray.shutdown()
def test_all_gather_torch_v2():
"""
In this test, we instantiate 4 workers, each holding one GPU, and test all_gather
"""
ray.init()
# create 4 workers, each hold a GPU
resource_pool = RayResourcePool([4], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestAllGatherActorV2, size=2)
worker_group = RayWorkerGroup(resource_pool, class_with_args, name_prefix="worker_group_torch")
output = worker_group.execute_all_sync('all_gather')
for i in range(1, len(output)):
assert torch.all(output[i] == output[0])
output = output[0].cpu()
print(output)
assert torch.all(output == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.int64))
ray.shutdown()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision, CPUOffload
from torch.distributed.fsdp.api import ShardingStrategy, ShardedStateDictConfig, StateDictType
import torch
from verl.utils.distributed import initialize_global_process_group
from verl.third_party.vllm import LLM
from vllm import SamplingParams
import time
import torch.distributed as dist
def main():
assert torch.cuda.is_available(), 'CUDA must be available to run the FSDP vLLM example'
local_rank, rank, world_size = initialize_global_process_group()
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'Qwen/Qwen2-7B-Instruct'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
actor_model_config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=True)
with torch.device("cuda"):
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True)
actor_model.to(torch.bfloat16)
max_prompt_length = 16
response_length = 32
preencode_prompts = [
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
from verl.utils.torch_functional import pad_sequence_to_length
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True).cuda()
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True).cuda()
from transformers import GenerationConfig
generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=32,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]
print(f'hf response: {tokenizer.batch_decode(response)}')
tensor_model_parallel_size = 4
from torch.distributed.device_mesh import init_device_mesh
device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32)
fsdp_model = FSDP(actor_model,
use_orig_params=True,
auto_wrap_policy=None,
device_id=torch.cuda.current_device(),
sharding_strategy=ShardingStrategy.FULL_SHARD,
mixed_precision=mixed_precision,
cpu_offload=CPUOffload(offload_params=False),
sync_module_states=False,
device_mesh=device_mesh)
FSDP.set_state_dict_type(fsdp_model,
state_dict_type=StateDictType.SHARDED_STATE_DICT,
state_dict_config=ShardedStateDictConfig())
state_dict = fsdp_model.state_dict()
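# with SHARDED_STATE_DICT each rank holds DTensor shards; they are synced into vLLM below via load_format='dtensor'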
sampling_params = SamplingParams(temperature=0,
top_p=1,
n=1,
max_tokens=response_length,
logprobs=1,
ignore_eos=True,
detokenize=False)
print(actor_model_config)
llm = LLM(model=None,
tokenizer=tokenizer,
model_hf_config=actor_model_config,
tensor_parallel_size=tensor_model_parallel_size,
enforce_eager=True,
dtype='bfloat16',
load_format='dummy_dtensor',
gpu_memory_utilization=0.8,
trust_remote_code=True)
# Warmup iterations
for _ in range(10):
torch.cuda.synchronize()
llm.sync_model_weights(actor_weights=state_dict, load_format='dtensor')
torch.cuda.synchronize()
dist.barrier()
start_time = time.time()
llm.sync_model_weights(actor_weights=state_dict, load_format='dtensor')
torch.cuda.synchronize()
dist.barrier()
end_time = time.time()
# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.6f} seconds")
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
idx_list = []
batch_size = input_ids.shape[0]
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
from verl.workers.rollout.vllm_rollout.vllm_rollout import _pre_process_inputs
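# strip the left padding from each prompt so vLLM receives raw prompt token ids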
for i in range(batch_size):
idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i]))
print('start generation')
outputs = llm.generate(prompt_token_ids=idx_list, sampling_params=sampling_params, use_tqdm=False)
vllm_output = outputs[0].cuda()
if torch.distributed.get_rank() == 0:
print(f'hf response: {tokenizer.batch_decode(response)}')
print(f'vllm response: {tokenizer.batch_decode(vllm_output)}')
if __name__ == "__main__":
main()
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from torch.distributed.device_mesh import init_device_mesh
from sglang.srt.entrypoints.verl_engine import VerlEngine
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GenerationConfig
from verl.utils.torch_functional import pad_sequence_to_length
def levenshtein(s1, s2):
m, n = len(s1), len(s2)
# Initialize matrix of zeros
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize first column and first row of the matrix
for i in range(m + 1):
dp[i][0] = i # Deletion from s1 to empty string
for j in range(n + 1):
dp[0][j] = j # Insertion to s1 from empty string
# Compute the Levenshtein distance matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1 # No cost if characters match
dp[i][j] = min(
dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost # Substitution
)
return dp[m][n]
def are_lists_similar(a, b):
if len(a) != len(b):
print("The lists are of different lengths.")
return False
total_length = 0
total_diff = 0
for s1, s2 in zip(a, b):
max_len = max(len(s1), len(s2))
total_length += max_len
diff = levenshtein(s1, s2)
total_diff += diff
print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
percentage_difference = (total_diff / total_length) * 100
print(f"Total difference: {percentage_difference:.2f}%")
return percentage_difference <= 10
def initialize_global_process_group(timeout_second=36000):
from datetime import timedelta
import torch.distributed
# NOTE MODIFIED should provide backend=None to have nccl+gloo
# torch.distributed.init_process_group('nccl', timeout=timedelta(seconds=timeout_second))
torch.distributed.init_process_group(timeout=timedelta(seconds=timeout_second))
local_rank = int(os.environ["LOCAL_RANK"])
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
if torch.distributed.is_initialized():
torch.cuda.set_device(local_rank)
return local_rank, rank, world_size
def test_sglang_spmd():
assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'
initialize_global_process_group()
# fill rollout config
max_prompt_length = 16
max_response_length = 16
# Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'Qwen/Qwen2-7B-Instruct'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side='left')
preencode_prompts = [
"Who won the Champions League in 2019?",
"The founder of Apple is",
"What's your name",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
actor_model.to(torch.bfloat16)
sampling_params = dict(n=1,
temperature=0,
top_p=1,
top_k=-1,
max_new_tokens=max_response_length,
presence_penalty=0.0,
frequency_penalty=0.0,
repetition_penalty=1.0,
skip_special_tokens=True,
spaces_between_special_tokens=True,
ignore_eos=False)
tensor_parallel_size = 4
device_mesh_kwargs = dict(mesh_shape=(1, tensor_parallel_size, 1), mesh_dim_names=["dp", "tp", "pp"])
inference_device_mesh_cpu = init_device_mesh("cpu", **device_mesh_kwargs)
for k in ["TORCHELASTIC_USE_AGENT_STORE"]:
if k in os.environ:
del os.environ[k]
print('building sglang rollout engine')
llm = VerlEngine(model_path=local_model_path,
dtype="bfloat16",
mem_fraction_static=0.5,
device_mesh_cpu=inference_device_mesh_cpu["tp"],
base_gpu_id=0,
gpu_id_step=1)
llm.release_memory_occupation()
print("start generation")
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
batch_size = input_ids.size(0)
generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_response_length,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]
hf_response_tokens = tokenizer.batch_decode(response)
print(f"hf response: {hf_response_tokens}")
print(f"{sampling_params=}")
idx_list = []
batch_size = input_ids.shape[0]
pad_token_id = (tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id)
for i in range(batch_size):
idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i]))
outputs = llm.generate(input_ids=idx_list, sampling_params=sampling_params)
sglang_response_tokens = []
for output in outputs:
print(f"{output=}")
generated_text = output["text"]
sglang_response_tokens.append(generated_text)
print(f"sglang response: {sglang_response_tokens}")
assert are_lists_similar(hf_response_tokens, sglang_response_tokens), \
f"Strings differ more than 10%:\n"
print("Check Pass")
def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor):
# remove the left padding in the prompt token_id
# pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
token_ids = prompt_token_ids[non_pad_index:].tolist()
return token_ids
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import transformers
from verl.third_party.vllm import LLM, vllm_version
from verl.utils.model import update_model_config
from vllm import SamplingParams
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import GenerationConfig
from verl.utils.torch_functional import pad_sequence_to_length
from verl.workers.rollout.vllm_rollout.vllm_rollout import _pre_process_inputs
def levenshtein(s1, s2):
m, n = len(s1), len(s2)
# Initialize matrix of zeros
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize first column and first row of the matrix
for i in range(m + 1):
dp[i][0] = i # Deletion from s1 to empty string
for j in range(n + 1):
dp[0][j] = j # Insertion to s1 from empty string
# Compute the Levenshtein distance matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1 # No cost if characters match
dp[i][j] = min(
dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost # Substitution
)
return dp[m][n]
def are_lists_similar(a, b):
if len(a) != len(b):
print("The lists are of different lengths.")
return False
total_length = 0
total_diff = 0
for s1, s2 in zip(a, b):
max_len = max(len(s1), len(s2))
total_length += max_len
diff = levenshtein(s1, s2)
total_diff += diff
print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
percentage_difference = (total_diff / total_length) * 100
print(f"Total difference: {percentage_difference:.2f}%")
return percentage_difference <= 10
def test_vllm_with_hf():
assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'
# fill rollout config
max_prompt_length = 16
max_response_length = 16
# Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'deepseek-ai/deepseek-llm-7b-chat'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
preencode_prompts = [
"Who won the Champions League in 2019?",
"The founder of Apple is",
"What's your name",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
actor_model.to(torch.bfloat16)
actor_model_config = AutoConfig.from_pretrained(local_model_path)
temperature = 0
top_p = 1
kwargs = dict(n=1,
temperature=temperature,
top_p=top_p,
max_tokens=max_response_length,
logprobs=1,
ignore_eos=True)
if vllm_version in ('0.4.2', '0.5.4', '0.6.3'):
kwargs['detokenize'] = False
sampling_params = SamplingParams(**kwargs)
tensor_parallel_size = 4
llm = LLM(model=actor_model,
tokenizer=tokenizer,
model_hf_config=actor_model_config,
tensor_parallel_size=tensor_parallel_size,
dtype='bfloat16',
gpu_memory_utilization=0.1,
load_format='hf')
print('start generation')
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
batch_size = input_ids.size(0)
idx_list = []
# parse idx from torch.Tensor to List[List[int]]
for i in range(batch_size):
idx_list.append(_pre_process_inputs(tokenizer.pad_token_id, input_ids[i]))
outputs = llm.generate(prompt_token_ids=idx_list, sampling_params=sampling_params, use_tqdm=False)
vllm_output = outputs[0].cuda()
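# free the vLLM KV cache and drop the engine before running HF generation on the same GPUs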
llm.free_cache_engine()
llm = None
import gc
torch.cuda.empty_cache()
gc.collect()
generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_response_length,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]
hf_response_tokens = tokenizer.batch_decode(response)
vllm_response_tokens = tokenizer.batch_decode(vllm_output)
print(f'hf response: {hf_response_tokens}')
print(f'vllm response: {vllm_response_tokens}')
assert are_lists_similar(hf_response_tokens, vllm_response_tokens), \
f'Strings differ more than 10%:\n'
print('Check Pass')
# if __name__ == "__main__":
# test_vllm_with_hf()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import transformers
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision, CPUOffload
from torch.distributed.fsdp.api import ShardingStrategy, ShardedStateDictConfig, StateDictType
from vllm import LLM, SamplingParams
from verl.utils.model import update_model_config
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import GenerationConfig
from verl.utils.distributed import initialize_global_process_group
from verl.utils.torch_functional import pad_sequence_to_length
def levenshtein(s1, s2):
m, n = len(s1), len(s2)
# Initialize matrix of zeros
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize first column and first row of the matrix
for i in range(m + 1):
dp[i][0] = i # Deletion from s1 to empty string
for j in range(n + 1):
dp[0][j] = j # Insertion to s1 from empty string
# Compute the Levenshtein distance matrix
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1 # No cost if characters match
dp[i][j] = min(
dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost # Substitution
)
return dp[m][n]
def are_lists_similar(a, b):
if len(a) != len(b):
print("The lists are of different lengths.")
return False
total_length = 0
total_diff = 0
for s1, s2 in zip(a, b):
max_len = max(len(s1), len(s2))
total_length += max_len
diff = levenshtein(s1, s2)
total_diff += diff
print(f"Comparing strings:\n{s1}\n{s2}\nDifference: {diff} characters\n")
percentage_difference = (total_diff / total_length) * 100
print(f"Total difference: {percentage_difference:.2f}%")
return percentage_difference <= 15
def test_vllm_spmd():
assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'
local_rank, rank, world_size = initialize_global_process_group()
# Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'Qwen/Qwen2-7B-Instruct'
from verl.utils.fs import copy_to_local
local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side='left', trust_remote_code=True)
actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True)
actor_model.to(torch.bfloat16)
# fill rollout config
max_prompt_length = 16
max_response_length = 32
preencode_prompts = [
"Who won the Champions League in 2019?",
"The founder of Apple is",
"What's your name",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']
input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)
print('start generation')
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
temperature = 0
top_p = 1
kwargs = dict(n=1,
temperature=temperature,
top_p=top_p,
max_tokens=max_response_length,
logprobs=1,
ignore_eos=True)
tensor_parallel_size = 4
from torch.distributed.device_mesh import init_device_mesh
device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32)
fsdp_model = FSDP(actor_model,
use_orig_params=True,
auto_wrap_policy=None,
device_id=torch.cuda.current_device(),
sharding_strategy=ShardingStrategy.FULL_SHARD,
mixed_precision=mixed_precision,
cpu_offload=CPUOffload(offload_params=False),
sync_module_states=False,
device_mesh=device_mesh)
FSDP.set_state_dict_type(fsdp_model,
state_dict_type=StateDictType.SHARDED_STATE_DICT,
state_dict_config=ShardedStateDictConfig())
state_dict = fsdp_model.state_dict()
sampling_params = SamplingParams(**kwargs)
llm = LLM(
model=local_model_path,
enable_sleep_mode=True,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend="external_launcher",
dtype='bfloat16',
enforce_eager=True,
gpu_memory_utilization=0.8,
disable_custom_all_reduce=True,
disable_mm_preprocessor_cache=True,
skip_tokenizer_init=False,
enable_prefix_caching=True,
trust_remote_code=True,
seed=1,
)
outputs = llm.generate(preencode_prompts, sampling_params=sampling_params, use_tqdm=False)
vllm_response_tokens = []
for output in outputs:
generated_text = output.outputs[0].text
vllm_response_tokens.append(generated_text)
world_size = torch.distributed.get_world_size()
model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
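# gather each FSDP shard into a full tensor before loading the weights into the in-process vLLM model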
model.load_weights(
((name, param.full_tensor() if world_size != 1 else param) for name, param in state_dict.items()))
outputs = llm.generate(preencode_prompts, sampling_params=sampling_params, use_tqdm=False)
verl_vllm_response_tokens = []
for output in outputs:
generated_text = output.outputs[0].text
verl_vllm_response_tokens.append(generated_text)
if torch.distributed.get_rank() == 0:
print(f'vllm response: {vllm_response_tokens}')
print(f'verl-vllm response: {verl_vllm_response_tokens}')
assert are_lists_similar(vllm_response_tokens, verl_vllm_response_tokens), \
'Strings differ more than 15%:\n'
print('Check Pass')
torch.distributed.destroy_process_group()
if __name__ == "__main__":
test_vllm_spmd()
# Copyright 2024 PRIME team and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from verl.utils.reward_score import _default_compute_score
from verl.utils.reward_score.prime_code import apps_check_correctness
import asyncio
from verl.workers.reward_manager.prime import parallel_compute_score_async
prime_math_answers = [
"""\\begin{bmatrix}\n -7 & 6 & -8 \\\\\n 11 & -9 & 12 \\\\\n 15 & -16 & 19 \n \\end{bmatrix}""",
"""\\frac{\\sqrt{505}}{7}""", """x^2 + y^2 + 4x - 6y + 13"""
]
prime_math_gts = [
"""\\begin{pmatrix}\n -7 & 6 & -8 \\\\\n 11 & -9 & 12 \\\\\n 15 & -16 & 19\n \\end{pmatrix}""", # mat test
"""\\frac{\\sqrt{505}}{7}""", # frac test
"""(x + 2)^2 + (y - 3)^2 """ # symbolic test
]
prime_code_answers = [
"""import sys
from collections import deque
def main():
data = sys.stdin.read().split()
it = iter(data)
# Read start and target positions
x0, y0, x1, y1 = int(next(it)), int(next(it)), int(next(it)), int(next(it))
n = int(next(it))
allowed = set()
# The total number of allowed cells is at most 10^5.
for _ in range(n):
r = int(next(it))
a = int(next(it))
b = int(next(it))
for c in range(a, b + 1):
allowed.add((r, c))
# Directions for the king (8 neighboring cells)
directions = [(-1, -1), (-1, 0), (-1, 1),
(0, -1), (0, 1),
(1, -1), (1, 0), (1, 1)]
start = (x0, y0)
target = (x1, y1)
# BFS initialization
queue = deque()
queue.append((x0, y0, 0))
# Mark the starting cell as visited by removing it from allowed set.
allowed.discard(start)
while queue:
x, y, moves = queue.popleft()
if (x, y) == target:
print(moves)
return
for dx, dy in directions:
nx, ny = x + dx, y + dy
if (nx, ny) in allowed:
allowed.remove((nx, ny))
queue.append((nx, ny, moves + 1))
print(-1)
if __name__ == '__main__':
main()
"""
] * 2
prime_code_gts = [
"""{\n \"inputs\": [\n \"5 7 6 11\\n3\\n5 3 8\\n6 7 11\\n5 2 5\\n\",\n \"3 4 3 10\\n3\\n3 1 4\\n4 5 9\\n3 10 10\\n\",\n \"1 1 2 10\\n2\\n1 1 3\\n2 6 10\\n\",\n \"9 8 7 8\\n9\\n10 6 6\\n10 6 6\\n7 7 8\\n9 5 6\\n8 9 9\\n9 5 5\\n9 8 8\\n8 5 6\\n9 10 10\\n\",\n \"6 15 7 15\\n9\\n6 15 15\\n7 14 14\\n6 15 15\\n9 14 14\\n7 14 16\\n6 15 15\\n6 15 15\\n7 14 14\\n8 15 15\\n\",\n \"13 16 20 10\\n18\\n13 16 16\\n20 10 10\\n19 10 10\\n12 15 15\\n20 10 10\\n18 11 11\\n19 10 10\\n19 10 10\\n20 10 10\\n19 10 10\\n20 10 10\\n20 10 10\\n19 10 10\\n18 11 11\\n13 16 16\\n12 15 15\\n19 10 10\\n19 10 10\\n\",\n \"89 29 88 30\\n16\\n87 31 31\\n14 95 95\\n98 88 89\\n96 88 88\\n14 97 97\\n13 97 98\\n100 88 88\\n88 32 32\\n99 88 89\\n90 29 29\\n87 31 31\\n15 94 96\\n89 29 29\\n88 32 32\\n97 89 89\\n88 29 30\\n\",\n \"30 14 39 19\\n31\\n35 7 11\\n37 11 12\\n32 13 13\\n37 5 6\\n46 13 13\\n37 14 14\\n31 13 13\\n43 13 19\\n45 15 19\\n46 13 13\\n32 17 17\\n41 14 19\\n30 14 14\\n43 13 17\\n34 16 18\\n44 11 19\\n38 13 13\\n40 12 20\\n37 16 18\\n46 16 18\\n34 10 14\\n36 9 10\\n36 15 19\\n38 15 19\\n42 13 19\\n33 14 15\\n35 15 19\\n33 17 18\\n39 12 20\\n36 5 7\\n45 12 12\\n\",\n \"2 1 1 1\\n2\\n1 1 2\\n2 1 2\\n\",\n \"1 1 1 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\",\n \"1 1 1000000000 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\"\n ],\n \"outputs\": [\n \"4\\n\",\n \"6\\n\",\n \"-1\\n\",\n \"2\\n\",\n \"1\\n\",\n \"-1\\n\",\n \"1\\n\",\n \"9\\n\",\n \"1\\n\",\n \"1\\n\",\n \"-1\\n\"\n ]\n}""", # A correct sample
"""{\n \"inputs\": [\n \"5 7 6 11\\n3\\n5 3 8\\n6 7 11\\n5 2 5\\n\",\n \"3 4 3 10\\n3\\n3 1 4\\n4 5 9\\n3 10 10\\n\",\n \"1 1 2 10\\n2\\n1 1 3\\n2 6 10\\n\",\n \"9 8 7 8\\n9\\n10 6 6\\n10 6 6\\n7 7 8\\n9 5 6\\n8 9 9\\n9 5 5\\n9 8 8\\n8 5 6\\n9 10 10\\n\",\n \"6 15 7 15\\n9\\n6 15 15\\n7 14 14\\n6 15 15\\n9 14 14\\n7 14 16\\n6 15 15\\n6 15 15\\n7 14 14\\n8 15 15\\n\",\n \"13 16 20 10\\n18\\n13 16 16\\n20 10 10\\n19 10 10\\n12 15 15\\n20 10 10\\n18 11 11\\n19 10 10\\n19 10 10\\n20 10 10\\n19 10 10\\n20 10 10\\n20 10 10\\n19 10 10\\n18 11 11\\n13 16 16\\n12 15 15\\n19 10 10\\n19 10 10\\n\",\n \"89 29 88 30\\n16\\n87 31 31\\n14 95 95\\n98 88 89\\n96 88 88\\n14 97 97\\n13 97 98\\n100 88 88\\n88 32 32\\n99 88 89\\n90 29 29\\n87 31 31\\n15 94 96\\n89 29 29\\n88 32 32\\n97 89 89\\n88 29 30\\n\",\n \"30 14 39 19\\n31\\n35 7 11\\n37 11 12\\n32 13 13\\n37 5 6\\n46 13 13\\n37 14 14\\n31 13 13\\n43 13 19\\n45 15 19\\n46 13 13\\n32 17 17\\n41 14 19\\n30 14 14\\n43 13 17\\n34 16 18\\n44 11 19\\n38 13 13\\n40 12 20\\n37 16 18\\n46 16 18\\n34 10 14\\n36 9 10\\n36 15 19\\n38 15 19\\n42 13 19\\n33 14 15\\n35 15 19\\n33 17 18\\n39 12 20\\n36 5 7\\n45 12 12\\n\",\n \"2 1 1 1\\n2\\n1 1 2\\n2 1 2\\n\",\n \"1 1 1 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\",\n \"1 1 1000000000 2\\n5\\n1000000000 1 10000\\n19920401 1188 5566\\n1000000000 1 10000\\n1 1 10000\\n5 100 200\\n\"\n ],\n \"outputs\": [\n \"4\\n\",\n \"6\\n\",\n \"-1\\n\",\n \"-1\\n\",\n \"1\\n\",\n \"-1\\n\",\n \"1\\n\",\n \"9\\n\",\n \"1\\n\",\n \"1\\n\",\n \"-1\\n\"\n ]\n}"""
] # A failed sample with first several in-out passed
prime_code_scores = [1.0, 0.9]
def test_parallelism():
"""
Test if process pool works properly
"""
sequences_str = []
ground_truth = []
data_sources = []
while len(sequences_str) < 32:
sequences_str.extend(prime_code_answers)
ground_truth.extend(prime_code_gts)
data_sources.extend(['codecontests'] * len(prime_code_answers))
sequences_str.extend(prime_math_answers)
ground_truth.extend(prime_math_gts)
data_sources.extend(['numina_aops_forum'] * len(prime_math_answers))
scores = asyncio.run(
parallel_compute_score_async(_default_compute_score,
sequences_str,
ground_truth,
data_sources,
num_processes=16))
print(scores)
def test_prime_code():
"""
Test PRIME code sandbox.
"""
data_source = 'codecontests'
for completion, ground_truth, score_ in zip(prime_code_answers, prime_code_gts, prime_code_scores):
score = _default_compute_score(data_source, completion, ground_truth)
assert float(score) == score_
def test_check_correctness():
completion = prime_code_answers[0]
ground_truth = json.loads(prime_code_gts[0])
ground_truth_single = {'inputs': ground_truth['inputs'][:1], 'outputs': ground_truth['outputs'][:1]}
res, meta = apps_check_correctness(in_outs=ground_truth_single, generation=completion, timeout=5, debug=False)
print(res, meta)
def test_prime_math():
data_source = 'numina_aops_forum'
for completion, ground_truth in zip(prime_math_answers, prime_math_gts):
score = _default_compute_score(data_source, completion, ground_truth)
assert float(score) == 1.0
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates"
license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates"
# Add custom license headers below
license_head_prime = "Copyright 2024 PRIME team and/or its affiliates"
license_head_individual = "Copyright 2025 Individual Contributor:"
license_headers = [license_head_bytedance, license_head_bytedance_25, license_head_prime, license_head_individual]
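# Usage (from the repo root): python3 <this script> --directory <path>
# The loop below asserts that every .py file under the directory contains one of the headers above.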
from pathlib import Path
from argparse import ArgumentParser
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--directory', '-d', required=True, type=str)
args = parser.parse_args()
directory_in_str = args.directory
pathlist = Path(directory_in_str).glob('**/*.py')
for path in pathlist:
# path is a Path object, not a string
path_in_str = str(path.absolute())
print(path_in_str)
with open(path_in_str, 'r', encoding='utf-8') as f:
file_content = f.read()
has_license = False
for lh in license_headers:
if lh in file_content:
has_license = True
break
assert has_license, f'file {path_in_str} does not contain license'
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def test_import():
import verl
print(verl.__version__)
def test_single_controller_import():
import verl.single_controller
print(verl.single_controller.__version__)