# rollout.yaml
# actor_rollout_ref.rollout.name: hf/vllm/sglang. The default value will be removed in the future
name: ???
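
# Example (illustrative): the backend is usually selected via a Hydra override on the
# training command line, assuming this file is mounted under actor_rollout_ref.rollout:
#   actor_rollout_ref.rollout.name=vllm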

# sync: LLM, async: AsyncLLM
mode: sync

# Sampling temperature for rollout.
temperature: 1.0

# Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
top_k: -1

# Top-p sampling parameter. Default 1.0.
top_p: 1

# Maximum prompt length for rollout; typically the same as the data max prompt length.
# Resolves to data.max_prompt_length if it exists, otherwise falls back to 512.
prompt_length: ${oc.select:data.max_prompt_length,512}

# Maximum response length for rollout; typically the same as the data max response length.
# Resolves to data.max_response_length if it exists, otherwise falls back to 512.
response_length: ${oc.select:data.max_response_length,512}
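
# Note: ${oc.select:key,default} is OmegaConf's select resolver; it resolves to the
# referenced key when it exists in the merged config and falls back to the default
# otherwise. For example, with data.max_response_length=1024 the line above resolves
# to 1024; without it, response_length falls back to 512.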

# for vllm rollout
# Rollout model parameters type. Align with actor model's FSDP/Megatron type.
dtype: bfloat16

# Fraction of GPU memory used by vLLM/SGLang for KV cache.
gpu_memory_utilization: 0.5

# Whether to ignore EOS and continue generating after EOS is hit.
ignore_eos: False

# Whether to disable CUDA graph. Default True to allow cache freeing.
enforce_eager: True

# Whether to free engine KVCache after generation. Set enforce_eager=True when enabled.
free_cache_engine: True

# Tensor model parallel (TP) size for rollout. Not effective for the HF rollout.
tensor_model_parallel_size: 2

# max number of tokens in a batch
max_num_batched_tokens: 8192

# max length for rollout
max_model_len: null

# max number of sequences processed in a single engine iteration
max_num_seqs: 1024

# [Will be deprecated, use log_prob_micro_batch_size_per_gpu] The batch size for one forward pass in the computation of log_prob. Global batch size.
log_prob_micro_batch_size: null

# The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
log_prob_micro_batch_size_per_gpu: null

# enable dynamic batch size (sequence packing) for log_prob computation
# same as actor_rollout_ref.actor.use_dynamic_bsz if it exists, otherwise false
log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}

# max token length for log_prob computation
# same as actor_rollout_ref.actor.ppo_max_token_len_per_gpu if it exists, otherwise 16384
log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}

# disable logging statistics
disable_log_stats: True

# for hf rollout
# Whether to sample during training rollout. False uses greedy sampling.
do_sample: True

# number of responses sampled per prompt (i.e. number of sample times). Set > 1 for GRPO.
n: 1
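
# Example (illustrative): GRPO-style training samples several responses per prompt,
# e.g. via an override such as the following; the value 5 is only a placeholder:
#   actor_rollout_ref.rollout.n=5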

# Whether to wake up the inference engine in multiple stages to reduce peak memory during the training-rollout transition.
multi_stage_wake_up: false

# Extra inference engine arguments (vllm, sglang).
engine_kwargs:

  # for vllm
  vllm:

    # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
    swap_space: null

    # Whether to disable the preprocessor cache for multimodal models.
    disable_mm_preprocessor_cache: False

  # for sglang
  sglang:

    # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
    attention_backend: null
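
# Example (illustrative): engine_kwargs entries are forwarded to the selected inference
# engine. A vLLM run could enlarge the swap space while an SGLang run could pin the
# attention backend; the values below are assumptions for illustration, not recommended defaults.
#   engine_kwargs:
#     vllm:
#       swap_space: 8
#     sglang:
#       attention_backend: flashinfer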

# Sampling parameters used during validation.
val_kwargs:

  # sampling parameters for validation
  # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
  top_k: -1

  # Top-p sampling parameter. Default 1.0.
  top_p: 1.0

  # Sampling temperature for validation.
  temperature: 0

  # number of responses sampled per prompt during validation
  n: 1

  # Whether to sample during validation. False uses greedy sampling.
  do_sample: False
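
# Example (illustrative): the val_kwargs defaults above give greedy decoding at
# validation time. For sampled validation (e.g. pass@k-style evaluation), one might
# override the following; the temperature and n values are placeholders, not tuned recommendations:
#   actor_rollout_ref.rollout.val_kwargs.do_sample=True
#   actor_rollout_ref.rollout.val_kwargs.temperature=0.7
#   actor_rollout_ref.rollout.val_kwargs.n=4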

# Multi-turn interaction config for tools or chat.
multi_turn:

  # Set to True for multi-turn tool interaction tasks; rollout.name should also be set to sglang (see the example at the end of this section)
  enable: False

  # null for no limit (default max_length // 3)
  max_assistant_turns: null

  # null for no tool
  tool_config_path: null

  # null for no limit (default max_length // 3)
  max_user_turns: null

  # max number of parallel tool calls in a single turn
  max_parallel_calls: 1

  # max length of tool response
  max_tool_response_length: 256

  # truncate side of tool response: left, middle, right
  tool_response_truncate_side: middle

  # null for no interaction
  interaction_config_path: null

  # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
  # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
  #   which may contain additional content such as reasoning content. This maintains consistency between training and rollout, but it leads to longer prompts.
  use_inference_chat_template: False

  # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
  # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
  # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
  # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
  # Qwen/QwQ-32B, Qwen/Qwen3-xxB
  # - disable: disable tokenization sanity check
  # - strict: enable strict tokenization sanity check (default)
  # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
  tokenization_sanity_check_mode: strict

  # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
  format: hermes
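
# Example (illustrative): a minimal multi-turn tool setup, assuming an SGLang rollout
# backend and a tool config file at the placeholder path below:
#   actor_rollout_ref.rollout.name=sglang
#   actor_rollout_ref.rollout.multi_turn.enable=True
#   actor_rollout_ref.rollout.multi_turn.tool_config_path=/path/to/tool_config.yaml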

# Support logging rollout probabilities for debugging purposes.
calculate_log_probs: False

# [Experimental] agent loop based rollout configs
agent:

  # Number of agent loop workers
  num_workers: 8

  # custom agent loop config path, which should contain a list of configs to initialize AgentLoop instances.
  # https://hydra.cc/docs/advanced/instantiate_objects/overview/
  #
  # - name: react_agent
  #   _target_: recipe.langgraph_agent.react_agent_loop.ReactAgentLoop
  #   tools: ["get_current_temperature"]
  # - name: math_expression
  #   _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
  #   min_terms: 2
  #   max_terms: 6
  agent_loop_config_path: null

  # custom async server configs
  custom_async_server:

    # Path to the custom async server implementation
    path: null

    # Class name of the custom async server class (e.g. AsyncvLLMServer)
    name: null
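
    # Example (illustrative; both the path and class name below are hypothetical):
    #   path: my_project/rollout/my_async_server.py
    #   name: MyAsyncVLLMServer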

# Specifies the tensor bucket size (in megabytes) for batch weight updates during rollout operations.
# This parameter controls the maximum payload size for a single weight update request.
# Reference: https://github.com/volcengine/verl/pull/2418
# Currently only supported in SGLang rollout implementations
# Larger values may improve throughput but increase memory overhead
# Detailed performance comparison:
# https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/issues/169#issuecomment-3070686720
# Default value (512MB) is optimized for typical GPU memory configurations
# For the best performance of `rebuild_cuda_tensor`, it is recommended to:
# 1. Enable `RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES`
# 2. Manually set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`
# when using Tensor Parallelism (TP) >= 8.
update_weights_bucket_megabytes: 512
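
# Example (illustrative): the environment setup suggested above for TP >= 8 can be
# exported before launching the job, e.g. in the launch script:
#   export RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1
#   export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Setting the Ray variable to 1 is an assumption about how it is enabled; check the
# Ray documentation for your version.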

# trace rollout data
trace:
  
  # trace backend; supported backends: mlflow, weave
  backend: null

  # whether to translate token ids to text in the output
  token2text: False
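
# Example (illustrative): enabling tracing with one of the supported backends, e.g.
#   actor_rollout_ref.rollout.trace.backend=mlflow
#   actor_rollout_ref.rollout.trace.token2text=True
# Backend-specific endpoints or credentials (e.g. MLFLOW_TRACKING_URI for mlflow)
# are configured outside this file and are not shown here.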