reward_model.yaml 2.76 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# configs for the reward model

# Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions.
# In GSM8K and Math examples, we disable reward model.
# For RLHF alignment example using full_hh_rlhf, we utilize reward model to assess the responses.
# If False, the following parameters are not effective
enable: False

# FSDP strategy: "fsdp" or "fsdp2"
strategy: ???

# model config for reward scoring
model:

  # Input tokenizer. If the reward model's chat template is inconsistent with the policy,
  # we need to first decode to plaintext, then apply the rm's chat_template.
  # Then score with RM. If chat_templates are consistent, it can be set to null.
  # set this to null if the chat template is identical
  input_tokenizer: ${actor_rollout_ref.model.path}

  # RM’s HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification.
  # Other model types need to define their own RewardModelWorker and pass it from the code.
  path: ~/models/FsfairX-LLaMA3-RM-v0.1

  # External model implementation (optional)
  external_lib: ${actor_rollout_ref.model.external_lib}

  # Whether to enable loading a remote code model, default to False
  trust_remote_code: False

# [Deprecated] Global micro batch size
# will be deprecated, use micro_batch_size_per_gpu
micro_batch_size: null

# Local per-GPU micro batch size
micro_batch_size_per_gpu: null

# Maximum sequence length to process for scoring
max_length: null

# Whether to dynamically adjust batch size at runtime
use_dynamic_bsz: ${critic.use_dynamic_bsz}

# Maximum number of tokens per GPU in one forward pass
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

# Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
# Default is naive. If all verification functions are multiprocessing-safe,
# the reward manager can be set to prime for parallel verification.
reward_manager: naive

# Whether to launch custom reward function asynchronously during log_prob
# custom reward function executed async on CPU, during log_prob
launch_reward_fn_async: False

# Cloud/local sandbox fusion configuration for custom reward logic
sandbox_fusion:

  # Cloud /local function URL for sandbox execution
  url: null

  # Max concurrent requests allowed to sandbox
  max_concurrent: 64

  # Max memory limit for each sandbox process in MB
  memory_limit_mb: 1024

# profiler configs
profiler:

  # hint for the target config dataclass
  _target_: verl.utils.profiler.ProfilerConfig

  # True for each task has its own database, False for all tasks in one training step share one database.
  discrete: False

  # Whether to profile all ranks.
  all_ranks: False

  # The ranks that will be profiled. [] or [0,1,...]
  ranks: []