# Configs for the reward model.

# Whether to enable the reward model. If false, the reward is computed only with
# the user-defined reward functions.
# In the GSM8K and Math examples, the reward model is disabled.
# For the RLHF alignment example using full_hh_rlhf, the reward model is used to
# assess the responses.
# If false, the following parameters have no effect.
enable: false

# FSDP strategy: "fsdp" or "fsdp2".
# NOTE(review): "???" is presumably the OmegaConf mandatory-value marker, i.e. the
# composing config must supply this value - confirm.
strategy: ???

# Model config for reward scoring.
model:

  # Input tokenizer. If the reward model's chat template is inconsistent with the
  # policy's, responses are first decoded to plain text, the RM's chat_template is
  # applied, and the result is then scored with the RM.
  # Set this to null if the chat templates are identical.
  input_tokenizer: ${actor_rollout_ref.model.path}

  # RM's HDFS path or local path. Note that the RM only supports
  # AutoModelForSequenceClassification. Other model types need to define their own
  # RewardModelWorker and pass it in from the code.
  path: ~/models/FsfairX-LLaMA3-RM-v0.1

  # External model implementation (optional).
  external_lib: ${actor_rollout_ref.model.external_lib}

  # Whether to enable loading a remote-code model. Defaults to false.
  trust_remote_code: false

# [Deprecated] Global micro batch size.
# Will be removed; use micro_batch_size_per_gpu instead.
micro_batch_size: null

# Local per-GPU micro batch size.
micro_batch_size_per_gpu: null

# Maximum sequence length to process for scoring.
max_length: null

# Whether to dynamically adjust the batch size at runtime.
use_dynamic_bsz: ${critic.use_dynamic_bsz}

# Maximum number of tokens per GPU in one forward pass.
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

# Reward manager. This defines the mechanism for computing rule-based rewards and
# handling different reward sources.
# Default is naive. If all verification functions are multiprocessing-safe,
# the reward manager can be set to prime for parallel verification.
reward_manager: naive

# Whether to launch the custom reward function asynchronously during log_prob.
# When enabled, the custom reward function is executed asynchronously on the CPU
# during log_prob.
launch_reward_fn_async: false

# Cloud/local sandbox fusion configuration for custom reward logic.
sandbox_fusion:

  # Cloud/local function URL for sandbox execution.
  url: null

  # Maximum number of concurrent requests allowed to the sandbox.
  max_concurrent: 64

  # Maximum memory limit for each sandbox process, in MB.
  memory_limit_mb: 1024

# Profiler configs.
profiler:

  # Hint for the target config dataclass.
  _target_: verl.utils.profiler.ProfilerConfig

  # If true, each task has its own database; if false, all tasks in one training
  # step share one database.
  discrete: false

  # Whether to profile all ranks.
  all_ranks: false

  # The ranks that will be profiled: [] or [0, 1, ...].
  ranks: []