First commit.

f92481f0 · chenych · 7121d0b0 · f92481f0 · f92481f0 · f92481f0
Commit f92481f0 authored Mar 04, 2025 by chenych
20 changed files
--- a/verl/trainer/config.py
+++ b/verl/trainer/config.py
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PPO config
+"""
+import os
+from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from typing import Optional, Tuple
+from verl.workers.config import WorkerConfig
+def recursive_post_init(dataclass_obj):
+    """Run the ``post_init`` hook of a dataclass instance and of its nested dataclass fields.
+
+    The object's own ``post_init`` (when it has one) is called first; afterwards every
+    field whose current value is itself a dataclass is processed the same way.
+
+    Args:
+        dataclass_obj: a dataclass instance (it is handed to ``dataclasses.fields``).
+    """
+    if hasattr(dataclass_obj, "post_init"):
+        dataclass_obj.post_init()
+    # Lazy generator: each field value is read only when the loop reaches it,
+    # i.e. after the hooks of the preceding fields have already run.
+    field_values = (getattr(dataclass_obj, spec.name) for spec in fields(dataclass_obj))
+    for value in field_values:
+        if is_dataclass(value):
+            recursive_post_init(value)
+@dataclass
+class DataConfig:
+    """Dataset-related options of the PPO configuration (the ``data`` field of ``PPOConfig``)."""
+    # NOTE(review): the consumers of most fields live outside this file; unless a
+    # comment below says otherwise, their meaning is only implied by the name.
+    train_files: str = ""
+    val_files: str = ""
+    prompt_key: str = "prompt"
+    # Copied to worker.rollout.prompt_length by PPOConfig.post_init.
+    max_prompt_length: int = 512
+    # Copied to worker.rollout.response_length by PPOConfig.post_init.
+    max_response_length: int = 512
+    rollout_batch_size: int = 512
+    return_raw_input_ids: bool = False
+    return_raw_prompt: bool = False
+    # Raw string literal, so the backslash in "\boxed" is kept verbatim.
+    system_prompt: str = r"Please reason step by step, and put your final answer within \boxed{}."
+    shuffle: bool = True
+    seed: int = 1
+    # 4194304 == 2048 * 2048 and 262144 == 512 * 512; presumably pixel-count
+    # bounds for image inputs -- TODO confirm against the dataset code.
+    max_pixels: int = 4194304
+    min_pixels: int = 262144
+@dataclass
+class AlgorithmConfig:
+    """Algorithm options of the PPO configuration (the ``algorithm`` field of ``PPOConfig``)."""
+    # NOTE(review): gamma / lam / adv_estimator / kl_penalty are consumed outside this
+    # file; presumably discount factor, GAE lambda, advantage-estimator name and
+    # KL-penalty mode -- confirm against the trainer.
+    gamma: float = 1.0
+    lam: float = 1.0
+    adv_estimator: str = "gae"
+    kl_penalty: str = "kl"
+    # get_kl_controller accepts "fixed" and "adaptive"; anything else raises ValueError.
+    kl_type: str = "fixed"
+    # Passed to the KL controller as its (initial) coefficient.
+    kl_coef: float = 1e-3
+    # Only read for kl_type == "adaptive", where get_kl_controller asserts it is > 0,
+    # so the 0.0 default must be overridden in that mode.
+    kl_horizon: float = 0.0
+    # Only read for kl_type == "adaptive"; AdaptiveKLController.update divides by it.
+    kl_target: float = 0.0
+@dataclass
+class TrainerConfig:
+    """Trainer options of the PPO configuration (the ``trainer`` field of ``PPOConfig``)."""
+    total_episodes: int = 10
+    max_steps: Optional[int] = None
+    project_name: str = "easy_r1"
+    experiment_name: str = "demo"
+    logger: Tuple[str] = ("console", "wandb")
+    val_generations_to_log_to_wandb: int = 0
+    nnodes: int = 1
+    n_gpus_per_node: int = 8
+    save_freq: int = -1
+    load_checkpoint_path: Optional[str] = None
+    val_before_train: bool = True
+    val_only: bool = False
+    test_freq: int = -1
+    critic_warmup: int = 0
+    remove_previous_ckpt: bool = False
+    del_local_ckpt_after_load: bool = False
+    # Left as None, post_init derives it from project_name / experiment_name.
+    save_checkpoint_path: Optional[str] = None
+    def post_init(self):
+        """Fill in ``save_checkpoint_path`` when it was left unset.
+
+        The default is ``checkpoints/<project_name>/<experiment_name>``; an
+        explicitly configured path is kept untouched.
+        """
+        if self.save_checkpoint_path is not None:
+            return
+        default_dir = os.path.join("checkpoints", self.project_name, self.experiment_name)
+        self.save_checkpoint_path = default_dir
+@dataclass
+class PPOConfig:
+    """Top-level PPO configuration grouping the data, worker, algorithm and trainer sections."""
+    data: DataConfig = field(default_factory=DataConfig)
+    worker: WorkerConfig = field(default_factory=WorkerConfig)
+    algorithm: AlgorithmConfig = field(default_factory=AlgorithmConfig)
+    trainer: TrainerConfig = field(default_factory=TrainerConfig)
+    def post_init(self):
+        """Propagate the prompt/response length limits from ``data`` to the rollout worker config."""
+        rollout_cfg = self.worker.rollout
+        rollout_cfg.prompt_length = self.data.max_prompt_length
+        rollout_cfg.response_length = self.data.max_response_length
+    def deep_post_init(self):
+        """Run ``post_init`` on this config and on every nested dataclass field."""
+        recursive_post_init(self)
+    def to_dict(self):
+        """Return the configuration as a plain nested dict (via ``dataclasses.asdict``)."""
+        plain = asdict(self)
+        return plain
--- a/verl/trainer/core_algos.py
+++ b/verl/trainer/core_algos.py
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Core functions to implement PPO algorithms.
+The functions implemented in this file are meant to be shared by trainers that use different
+distributed strategies to implement PPO.
+"""
+from collections import defaultdict
+from typing import TYPE_CHECKING, Tuple
+import numpy as np
+import torch
+import verl.utils.torch_functional as verl_F
+if TYPE_CHECKING:
+    from verl.trainer.config import AlgorithmConfig
+class AdaptiveKLController:
+    """
+    Adaptive KL controller described in the paper:
+    https://arxiv.org/pdf/1909.08593.pdf
+
+    ``value`` holds the current KL coefficient. Each call to ``update`` rescales it
+    multiplicatively, depending on how far the observed KL is from ``target``.
+    """
+    def __init__(self, init_kl_coef: float, target_kl: float, horizon: float):
+        """
+        Args:
+            init_kl_coef: starting value of the KL coefficient.
+            target_kl: KL value the controller steers towards; must be > 0.
+            horizon: scale of the per-update adjustment; must be > 0.
+
+        Raises:
+            ValueError: if ``target_kl`` or ``horizon`` is not strictly positive.
+        """
+        # ``update`` divides by both quantities. Rejecting non-positive values here turns
+        # a late ZeroDivisionError / NaN-or-inf coefficient into an immediate, readable error.
+        if target_kl <= 0:
+            raise ValueError(f"target_kl must be larger than 0. Got {target_kl}.")
+        if horizon <= 0:
+            raise ValueError(f"horizon must be larger than 0. Got {horizon}.")
+        self.value = init_kl_coef
+        self.target = target_kl
+        self.horizon = horizon
+    def update(self, current_kl, n_steps):
+        """Rescale ``value`` from the observed KL.
+
+        Args:
+            current_kl: the KL value measured for the latest batch.
+            n_steps: weight of this update relative to ``horizon``.
+        """
+        # Relative error w.r.t. the target, limited to +/-20% per update.
+        proportional_error = np.clip(current_kl / self.target - 1, -0.2, 0.2)
+        mult = 1 + proportional_error * n_steps / self.horizon
+        self.value *= mult
+class FixedKLController:
+    """KL controller whose coefficient never changes."""
+    def __init__(self, kl_coef: float):
+        # Stored under ``value``, the same attribute name AdaptiveKLController uses.
+        self.value = kl_coef
+    def update(self, current_kl, n_steps):
+        """Accept the same arguments as ``AdaptiveKLController.update`` and do nothing."""
+        return None
+def get_kl_controller(algorithm_config: "AlgorithmConfig"):
+    """Build the KL controller selected by ``algorithm_config.kl_type``.
+
+    Args:
+        algorithm_config: object providing ``kl_type`` and ``kl_coef``; for the
+            adaptive controller also ``kl_target`` and ``kl_horizon``.
+
+    Returns:
+        A ``FixedKLController`` for ``"fixed"`` or an ``AdaptiveKLController`` for ``"adaptive"``.
+
+    Raises:
+        AssertionError: adaptive mode with a horizon that is not larger than 0.
+        ValueError: ``kl_type`` is neither ``"fixed"`` nor ``"adaptive"``.
+    """
+    mode = algorithm_config.kl_type
+    if mode == "fixed":
+        return FixedKLController(kl_coef=algorithm_config.kl_coef)
+    if mode == "adaptive":
+        assert algorithm_config.kl_horizon > 0, f"horizon must be larger than 0. Got {algorithm_config.kl_horizon}."
+        return AdaptiveKLController(
+            init_kl_coef=algorithm_config.kl_coef,
+            target_kl=algorithm_config.kl_target,
+            horizon=algorithm_config.kl_horizon,
+        )
+    raise ValueError("Unknown kl_ctrl type")
+def compute_gae_advantage_return(
+    token_level_rewards: torch.Tensor,
+    values: torch.Tensor,
+    eos_mask: torch.Tensor,
+    gamma: torch.Tensor,
+    lam: torch.Tensor,
+):
+    """Generalized Advantage Estimation over the response tokens.
+    Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        values: `(torch.Tensor)`
+            shape: (bs, response_length)
+        eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length). [EOS] mask; tokens after [EOS] have mask zero.
+            It is only used for the final whitening, not inside the backward recursion.
+        gamma: `(float)`
+            discount factor used in RL
+        lam: `(float)`
+            lambda value of Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length); whitened with `eos_mask`
+        returns: `(torch.Tensor)`
+            shape: (bs, response_length); un-whitened advantages plus `values`
+    """
+    with torch.no_grad():
+        num_steps = token_level_rewards.shape[-1]
+        final_step = num_steps - 1
+        running_gae = 0
+        per_step_advantages = []
+        # Walk backwards in time so each step can reuse the advantage accumulated for the next one.
+        for step in range(final_step, -1, -1):
+            if step < final_step:
+                bootstrap_value = values[:, step + 1]
+            else:
+                # Nothing follows the last position, so there is no value to bootstrap from.
+                bootstrap_value = 0.0
+            td_error = token_level_rewards[:, step] + gamma * bootstrap_value - values[:, step]
+            running_gae = td_error + gamma * lam * running_gae
+            per_step_advantages.append(running_gae)
+        # The list was filled last-to-first; restore chronological order before stacking.
+        per_step_advantages.reverse()
+        advantages = torch.stack(per_step_advantages, dim=1)
+        # Returns are taken from the raw advantages, i.e. before whitening.
+        returns = advantages + values
+        advantages = verl_F.masked_whiten(advantages, eos_mask)
+    return advantages, returns
+# NOTE(sgm): this implementation only consider outcome supervision, where the reward is a scalar.
+def compute_grpo_outcome_advantage(
+    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, index, epsilon: float = 1e-6
+):
+    """
+    Compute advantage for GRPO, operating only on Outcome reward
+    (with only one scalar reward for each response).
+    Each response's summed reward is normalised with the mean / std of all responses
+    that share the same entry in `index`. A group with a single response is left
+    un-centred (mean 0, std 1).
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        index: indexable of length bs holding the group id of every response.
+            May be a tensor, an array or a plain sequence of hashable ids.
+        epsilon: `(float)`
+            added to the std to avoid a division by zero
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length); the same tensor object as `advantages`
+    """
+    response_length = token_level_rewards.shape[-1]
+    scores = token_level_rewards.sum(dim=-1)
+    bsz = scores.shape[0]
+    # Tensor elements hash by object identity, so two equal ids read from a tensor would
+    # never land in the same group; convert them to plain Python values first.
+    group_keys = []
+    for i in range(bsz):
+        raw_key = index[i]
+        group_keys.append(raw_key.item() if isinstance(raw_key, torch.Tensor) else raw_key)
+    with torch.no_grad():
+        id2score = defaultdict(list)
+        for i, key in enumerate(group_keys):
+            id2score[key].append(scores[i])
+        id2mean = {}
+        id2std = {}
+        for key, group_scores in id2score.items():
+            if len(group_scores) == 1:
+                id2mean[key] = torch.tensor(0.0)
+                id2std[key] = torch.tensor(1.0)
+            else:
+                # torch.stack copies the values, so the statistics are not affected by
+                # the in-place normalisation of `scores` below.
+                stacked = torch.stack(group_scores)
+                id2mean[key] = torch.mean(stacked)
+                id2std[key] = torch.std(stacked)
+        for i, key in enumerate(group_keys):
+            scores[i] = (scores[i] - id2mean[key]) / (id2std[key] + epsilon)
+        # Broadcast the per-response scalar to every token and zero out positions after EOS.
+        scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask
+    return scores, scores
+def compute_reinforce_plus_plus_outcome_advantage(
+    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, gamma: torch.Tensor
+):
+    """
+    Compute advantage for REINFORCE++.
+    This implementation is based on the paper: https://arxiv.org/abs/2501.03262
+    Returns are the discounted reward-to-go per token; advantages are those returns
+    whitened with `eos_mask` and then masked.
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        gamma: discount factor applied per step
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length)
+    """
+    with torch.no_grad():
+        returns = torch.zeros_like(token_level_rewards)
+        num_steps = token_level_rewards.shape[1]
+        discounted_sum = 0
+        for step in range(num_steps - 1, -1, -1):
+            discounted_sum = token_level_rewards[:, step] + gamma * discounted_sum
+            returns[:, step] = discounted_sum
+            # Reset after EOS: a zero mask entry clears the accumulator for that row,
+            # so nothing is carried from this position into earlier ones.
+            discounted_sum = discounted_sum * eos_mask[:, step]
+        whitened = verl_F.masked_whiten(returns, eos_mask)
+        advantages = whitened * eos_mask
+    return advantages, returns
+def compute_remax_outcome_advantage(
+    token_level_rewards: torch.Tensor, reward_baselines: torch.Tensor, eos_mask: torch.Tensor
+):
+    """
+    Compute advantage for ReMax, operating only on Outcome reward
+    (with only one scalar reward for each response).
+    This implementation is based on the paper: https://arxiv.org/abs/2310.10505
+    Args:
+        token_level_rewards: `(torch.Tensor)`
+            shape: (bs, response_length)
+        reward_baselines: `(torch.Tensor)`
+            shape: (bs,)
+        eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length); returns minus the masked per-response baseline
+        Returns: `(torch.Tensor)`
+            shape: (bs, response_length); undiscounted reward-to-go of the masked rewards
+    """
+    response_length = token_level_rewards.shape[-1]
+    with torch.no_grad():
+        masked_rewards = token_level_rewards * eos_mask
+        # flip -> cumsum -> flip yields a suffix sum: position t holds the sum over t..end.
+        returns = masked_rewards.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
+        tiled_baselines = reward_baselines.unsqueeze(-1).tile([1, response_length])
+        advantages = returns - tiled_baselines * eos_mask
+    return advantages, returns
+def compute_rewards(token_level_scores, old_log_prob, ref_log_prob, kl_ratio):
+    """Subtract a scaled KL term from the scores.
+
+    The KL term is ``old_log_prob - ref_log_prob`` and is weighted by ``kl_ratio``.
+    """
+    return token_level_scores - (old_log_prob - ref_log_prob) * kl_ratio
+def compute_policy_loss(
+    old_log_prob, log_prob, advantages, eos_mask, cliprange
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Clipped PPO policy-gradient loss.
+    Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122
+    Args:
+        old_log_prob: `(torch.Tensor)`
+            shape: (bs, response_length)
+        log_prob: `(torch.Tensor)`
+            shape: (bs, response_length)
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+        cliprange: (float)
+            The clip range used in PPO. See https://arxiv.org/abs/1707.06347
+    Returns:
+        pg_loss: `a scalar torch.Tensor`
+            policy gradient loss computed via PPO
+        pg_clipfrac: masked mean of the indicator "clipped loss > unclipped loss",
+            i.e. the fraction of the policy gradient loss being clipped
+        ppo_kl: masked mean of `old_log_prob - log_prob`
+    """
+    log_ratio = log_prob - old_log_prob
+    ppo_kl = verl_F.masked_mean(-log_ratio, eos_mask)
+    ratio = torch.exp(log_ratio)
+    clipped_ratio = torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
+    unclipped_losses = -advantages * ratio
+    clipped_losses = -advantages * clipped_ratio
+    # Element-wise maximum of the two losses = the pessimistic (clipped) PPO objective.
+    worst_case_losses = torch.max(unclipped_losses, clipped_losses)
+    pg_loss = verl_F.masked_mean(worst_case_losses, eos_mask)
+    was_clipped = torch.gt(clipped_losses, unclipped_losses).float()
+    pg_clipfrac = verl_F.masked_mean(was_clipped, eos_mask)
+    return pg_loss, pg_clipfrac, ppo_kl
+def compute_entropy_loss(logits, eos_mask):
+    """Masked mean of the per-token categorical entropy.
+    Args:
+        logits: `(torch.Tensor)`
+            shape: (bs, response_length, vocab_size)
+        eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length)
+    Returns:
+        entropy: a scalar torch.Tensor
+    """
+    token_entropy = verl_F.entropy_from_logits(logits)  # (bs, response_len)
+    return verl_F.masked_mean(token_entropy, mask=eos_mask)
+def compute_value_loss(vpreds, returns, values, eos_mask, cliprange_value):
+    """Clipped value-function loss.
+    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1151
+    Args:
+        vpreds (`torch.FloatTensor`):
+            Predicted values of the value head, shape (`batch_size`, `response_length`)
+        returns: (`torch.FloatTensor`):
+            Ground truth returns, shape (`batch_size`, `response_length`)
+        values (`torch.FloatTensor`):
+            Old values of value head, shape (`batch_size`, `response_length`)
+        eos_mask: mask handed to the masked means
+        cliprange_value: predictions are clipped to `values +/- cliprange_value`
+    Returns:
+        vf_loss: a scalar (`torch.FloatTensor`):
+            value function loss
+        vf_clipfrac: a float
+            The ratio of vf being clipped
+    """
+    lower_bound = values - cliprange_value
+    upper_bound = values + cliprange_value
+    clipped_preds = verl_F.clip_by_value(vpreds, lower_bound, upper_bound)
+    unclipped_sq_err = (vpreds - returns) ** 2
+    clipped_sq_err = (clipped_preds - returns) ** 2
+    # Keep the larger of the two squared errors per element.
+    worst_case_sq_err = torch.max(unclipped_sq_err, clipped_sq_err)
+    vf_loss = 0.5 * verl_F.masked_mean(worst_case_sq_err, eos_mask)
+    was_clipped = torch.gt(clipped_sq_err, unclipped_sq_err).float()
+    vf_clipfrac = verl_F.masked_mean(was_clipped, eos_mask)
+    return vf_loss, vf_clipfrac
+def kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_penalty) -> torch.Tensor:
+    """Per-element KL penalty between `logprob` and `ref_logprob`.
+    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1104
+    Args:
+        logprob: log-probabilities under the current policy
+        ref_logprob: log-probabilities under the reference policy
+        kl_penalty: estimator name, one of
+            "kl"         -> logprob - ref_logprob
+            "abs"        -> |logprob - ref_logprob|
+            "mse"        -> 0.5 * (logprob - ref_logprob) ** 2
+            "low_var_kl" -> exp(d) - d - 1 with d = ref_logprob - logprob, clamped to [-10, 10]
+    Returns:
+        Tensor with the same shape as the inputs.
+    Raises:
+        NotImplementedError: for "full" and for any unknown estimator name.
+    """
+    mode = kl_penalty
+    if mode == "kl":
+        return logprob - ref_logprob
+    elif mode == "abs":
+        return torch.abs(logprob - ref_logprob)
+    elif mode == "mse":
+        return 0.5 * torch.square(logprob - ref_logprob)
+    elif mode == "low_var_kl":
+        # J. Schulman. Approximating kl divergence, 2020.
+        # URL http://joschu.net/blog/kl-approx.html.
+        log_ratio = ref_logprob - logprob
+        estimate = (torch.exp(log_ratio) - log_ratio - 1).contiguous()
+        return torch.clamp(estimate, min=-10, max=10)
+    # "full" would need the logits of every vocabulary token in logprob / ref_logprob;
+    # it is not implemented and is rejected together with unknown names.
+    raise NotImplementedError
--- a/verl/trainer/main.py
+++ b/verl/trainer/main.py
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that main is deliberately kept separate from ray_trainer, because ray_trainer is also used by other entry points.
+"""
+import json
+import ray
+from omegaconf import OmegaConf
+from verl.single_controller.ray import RayWorkerGroup
+from verl.trainer.config import PPOConfig
+from verl.trainer.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role
+from verl.utils import get_processor, get_tokenizer
+from verl.workers.fsdp_workers import FSDPWorker
+from verl.workers.reward import CustomRewardManager
+def main():
+    """CLI entry point: build the PPO config and launch ``main_task`` on Ray.
+
+    The configuration is assembled from three layers, later ones overriding earlier ones:
+    the ``PPOConfig`` defaults, an optional file given as ``config=<path>`` on the
+    command line, and the remaining command-line overrides.
+    """
+    cli_args = OmegaConf.from_cli()
+    merged_config = OmegaConf.structured(PPOConfig())
+    # ``config=<path>`` is optional: without it the run uses the defaults plus CLI overrides.
+    # pop() also removes the key so it is not merged into the structured config below.
+    config_path = cli_args.pop("config", None)
+    if config_path is not None:
+        file_config = OmegaConf.load(config_path)
+        merged_config = OmegaConf.merge(merged_config, file_config)
+    ppo_config = OmegaConf.merge(merged_config, cli_args)
+    ppo_config = OmegaConf.to_object(ppo_config)
+    if not ray.is_initialized():
+        # this is for local ray cluster
+        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
+    # Block until the remote training task has finished.
+    ray.get(main_task.remote(ppo_config))
+@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
+def main_task(config: PPOConfig):
+    """Remote driver task: finalise the config, build the trainer and run training.
+
+    Args:
+        config: the merged PPO configuration; its ``post_init`` hooks are run here.
+    """
+    config.deep_post_init()
+    print(json.dumps(config.to_dict(), indent=2))
+    # instantiate tokenizer and processor from the actor model path
+    model_path = config.worker.actor.model.model_path
+    tokenizer = get_tokenizer(model_path)
+    processor = get_processor(model_path, use_fast=True)
+    # every role uses the FSDP worker class and shares one global resource pool
+    roles = (Role.ActorRollout, Role.Critic, Role.RefPolicy)
+    role_worker_mapping = {role: ray.remote(FSDPWorker) for role in roles}
+    global_pool_id = "global_pool"
+    resource_pool_spec = {global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes}
+    mapping = {role: global_pool_id for role in roles}
+    # training and validation reward managers are built with identical settings
+    reward_kwargs = {
+        "tokenizer": tokenizer,
+        "num_examine": 1,
+        "compute_score": config.worker.reward.compute_score,
+    }
+    reward_fn = CustomRewardManager(**reward_kwargs)
+    val_reward_fn = CustomRewardManager(**reward_kwargs)
+    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+    trainer = RayPPOTrainer(
+        config=config,
+        tokenizer=tokenizer,
+        processor=processor,
+        role_worker_mapping=role_worker_mapping,
+        resource_pool_manager=resource_pool_manager,
+        ray_worker_group_cls=RayWorkerGroup,
+        reward_fn=reward_fn,
+        val_reward_fn=val_reward_fn,
+    )
+    trainer.init_workers()
+    trainer.fit()
+# Script entry point: run the CLI driver only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/verl/trainer/ray_trainer.py
+++ b/verl/trainer/ray_trainer.py
--- a/verl/utils/__init__.py
+++ b/verl/utils/__init__.py
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .tokenizer import get_processor, get_tokenizer
+# Public API of this package: the two helpers imported from .tokenizer.
+__all__ = ["get_processor", "get_tokenizer"]
--- a/verl/utils/checkpoint/__init__.py
+++ b/verl/utils/checkpoint/__init__.py
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/verl/utils/checkpoint/checkpoint_manager.py
+++ b/verl/utils/checkpoint/checkpoint_manager.py
--- a/verl/utils/checkpoint/fsdp_checkpoint_manager.py
+++ b/verl/utils/checkpoint/fsdp_checkpoint_manager.py
--- a/verl/utils/flops_counter.py
+++ b/verl/utils/flops_counter.py
--- a/verl/utils/fsdp_utils.py
+++ b/verl/utils/fsdp_utils.py
--- a/verl/utils/logger/__init__.py
+++ b/verl/utils/logger/__init__.py
--- a/verl/utils/logger/aggregate_logger.py
+++ b/verl/utils/logger/aggregate_logger.py
--- a/verl/utils/model_utils.py
+++ b/verl/utils/model_utils.py
--- a/verl/utils/performance.py
+++ b/verl/utils/performance.py
--- a/verl/utils/py_functional.py
+++ b/verl/utils/py_functional.py
--- a/verl/utils/reward_score/__init__.py
+++ b/verl/utils/reward_score/__init__.py
--- a/verl/utils/reward_score/math.py
+++ b/verl/utils/reward_score/math.py
--- a/verl/utils/reward_score/r1v.py
+++ b/verl/utils/reward_score/r1v.py
--- a/verl/utils/rl_dataset.py
+++ b/verl/utils/rl_dataset.py
--- a/verl/utils/tokenizer.py
+++ b/verl/utils/tokenizer.py