dp_critic.yaml 3.01 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# defaults lists the component configs this file is composed from
defaults:

  # dp critic config, inheriting from trainer/config/critic/critic.yaml
  - critic

  # _self_ is listed last: the defaults above are loaded first, then the fields
  # in the current yaml are applied on top of them
  - _self_

# Dataclass for this config; required when using verl.utils.omega_conf_to_dataclass
# to instantiate dataclass configs
_target_: verl.workers.config.FSDPCriticConfig

# Distribution strategy for the critic. Options: fsdp (being deprecated), fsdp2
strategy: fsdp

# Optimizer settings for the critic
optim:

  # Dataclass for this section; required when using verl.utils.omega_conf_to_dataclass
  # to instantiate dataclass configs
  _target_: verl.workers.config.FSDPOptimizerConfig

  # Minimum LR ratio for the cosine schedule
  # NOTE(review): presumably only consulted when warmup_style is "cosine" - confirm
  min_lr_ratio: null

  # LR warmup style: "constant" or "cosine"
  warmup_style: constant

# Critic model settings
model:

  # Dataclass target; needed when configs are instantiated through verl.utils.omega_conf_to_dataclass
  _target_: verl.workers.config.FSDPCriticModelCfg

  # Load the model through shared memory when true
  use_shm: false

  # Turn on gradient checkpointing to save memory
  enable_gradient_checkpointing: true

  # Move activations to CPU to lower GPU memory usage
  enable_activation_offload: false

  # Apply the remove-padding optimization (saves compute)
  use_remove_padding: false

  # Settings specific to FSDP
  fsdp_config:

    # Dataclass target; needed when configs are instantiated through verl.utils.omega_conf_to_dataclass
    _target_: verl.workers.config.FSDPEngineConfig

    # Offload model parameters to CPU when true
    param_offload: false

    # Offload optimizer state to CPU when true
    optimizer_offload: false

    # FSDP2 only: offload param/grad/optimizer during training
    offload_policy: false

    # FSDP2 only: reshard after the forward pass to shrink the memory footprint
    reshard_after_forward: true

    # Layer wrapping policy used by FSDP
    wrap_policy:

      # Parameter-count threshold at which wrapping is triggered
      min_num_params: 0

    # GPUs per FSDP shard group; -1 picks the size automatically
    fsdp_size: -1

    # FSDP1 only: prefetch the next forward-pass all-gather
    # before the current forward computation.
    forward_prefetch: false

  # LoRA rank; a positive value (e.g., 32) enables LoRA
  lora_rank: 0

  # Scaling factor for LoRA
  lora_alpha: 16

  # Modules LoRA targets: "all-linear" or a list of linear projection layers
  target_modules: all-linear

# Global batch size for forward-only passes during inference; mirrors
# ppo_micro_batch_size from the same config node, or null when that key is absent
forward_micro_batch_size: '${oc.select:.ppo_micro_batch_size,null}'

# Per-GPU batch size for forward-only passes during inference; mirrors
# ppo_micro_batch_size_per_gpu from the same config node, or null when that key is absent
forward_micro_batch_size_per_gpu: '${oc.select:.ppo_micro_batch_size_per_gpu,null}'

# Ulysses-style sequence parallelism size
ulysses_sequence_parallel_size: 1

# Gradient clipping value applied to critic updates
grad_clip: 1.0