# actor_rollout_ref.rollout.name: hf/vllm/sglang. The default value will be removed in the future.
name: ???

# sync: LLM, async: AsyncLLM
mode: sync

# Sampling temperature for rollout.
temperature: 1.0

# Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
top_k: -1

# Top-p sampling parameter. Default 1.0.
top_p: 1

# Typically the same as the data max prompt length.
# Same as data.max_prompt_length if it exists, otherwise 512.
prompt_length: ${oc.select:data.max_prompt_length,512}

# Typically the same as the data max response length.
# Same as data.max_response_length if it exists, otherwise 512.
response_length: ${oc.select:data.max_response_length,512}

# for vllm rollout
# Rollout model parameters type. Align with the actor model's FSDP/Megatron type.
dtype: bfloat16

# Fraction of GPU memory used by vLLM/SGLang for the KV cache.
gpu_memory_utilization: 0.5

# Whether to ignore EOS and continue generating after EOS is hit.
ignore_eos: False

# Whether to disable CUDA graph. Default True to allow cache freeing.
enforce_eager: True

# Whether to free the engine KV cache after generation. Set enforce_eager=True when enabled.
free_cache_engine: True

# TP size for rollout. Not effective for hf rollout.
tensor_model_parallel_size: 2

# Max number of tokens in a batch.
max_num_batched_tokens: 8192

# Max model length for rollout.
max_model_len: null

# Max number of sequences.
max_num_seqs: 1024

# [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
# The batch size for one forward pass in the computation of log_prob. Global batch size.
log_prob_micro_batch_size: null

# The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
log_prob_micro_batch_size_per_gpu: null

# Enable dynamic batch size (sequence packing) for log_prob computation.
# Same as actor_rollout_ref.actor.use_dynamic_bsz if it exists, otherwise false.
log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}

# Max token length for log_prob computation.
# Same as actor_rollout_ref.actor.ppo_max_token_len_per_gpu if it exists, otherwise 16384.
log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}

# Disable logging statistics.
disable_log_stats: True

# for hf rollout
# Whether to sample during training rollout. False uses greedy sampling.
do_sample: True

# Number of responses (i.e. number of samples per prompt). > 1 for GRPO.
n: 1

# Whether to wake up the inference engine in multiple stages to reduce peak memory during the training-rollout transition.
multi_stage_wake_up: false

# Extra inference engine arguments (vllm, sglang).
engine_kwargs:

  # for vllm
  vllm:

    # Swap space (in GB) used by the inference engine. null uses the default (e.g., 4 GB).
    swap_space: null

    # Whether to disable the preprocessor cache for multimodal models.
    disable_mm_preprocessor_cache: False

  # for sglang
  sglang:

    # The attention backend for the sglang engine. Options: flashinfer, triton, flashmla; null for default.
    attention_backend: null

# Sampling parameters used during validation.
val_kwargs:

  # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
  top_k: -1

  # Top-p sampling parameter. Default 1.0.
  top_p: 1.0

  # Sampling temperature for validation.
  temperature: 0

  # Number of responses per prompt during validation.
  n: 1

  # Whether to sample during validation. False uses greedy sampling.
  do_sample: False
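# A minimal sketch (illustrative values, not defaults) of how the fields above
# combine for a typical GRPO-style vLLM rollout; every value below is an example
# for orientation, not a recommendation:
#
#   name: vllm
#   mode: sync
#   n: 8
#   temperature: 1.0
#   top_p: 1.0
#   gpu_memory_utilization: 0.6
#   tensor_model_parallel_size: 2
#   engine_kwargs:
#     vllm:
#       swap_space: 8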
# Multi-turn interaction config for tools or chat.
multi_turn:

  # Set to True for multi-turn tool interaction tasks; rollout.name should be set to sglang as well.
  enable: False

  # null for no limit (default max_length // 3)
  max_assistant_turns: null

  # null for no tool
  tool_config_path: null

  # null for no limit (default max_length // 3)
  max_user_turns: null

  # Max number of parallel tool calls in a single turn.
  max_parallel_calls: 1

  # Max length of a tool response.
  max_tool_response_length: 256

  # Truncate side of tool response: left, middle, right.
  tool_response_truncate_side: middle

  # null for no interaction
  interaction_config_path: null

  # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
  # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
  #   which may contain additional content such as reasoning content. This maintains consistency between training and rollout, but it will lead to longer prompts.
  use_inference_chat_template: False

  # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
  # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
  # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
  # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
  # Qwen/QwQ-32B, Qwen/Qwen3-xxB
  # - disable: disable tokenization sanity check
  # - strict: enable strict tokenization sanity check (default)
  # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
  tokenization_sanity_check_mode: strict

  # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
  format: hermes

# Whether to log rollout probs for debugging purposes.
calculate_log_probs: False

# [Experimental] agent loop based rollout configs
agent:

  # Number of agent loop workers.
  num_workers: 8

  # Custom agent loop config path, which should contain a list of configs to initialize AgentLoop instances.
  # https://hydra.cc/docs/advanced/instantiate_objects/overview/
  #
  # - name: react_agent
  #   _target_: recipe.langgraph_agent.react_agent_loop.ReactAgentLoop
  #   tools: ["get_current_temperature"]
  # - name: math_expression
  #   _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
  #   min_terms: 2
  #   max_terms: 6
  agent_loop_config_path: null

  # custom async server configs
  custom_async_server:

    # Path to the custom async server implementation.
    path: null

    # Class name of the custom async server class (e.g. AsyncvLLMServer).
    name: null
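# An illustrative sketch of plugging in a custom async server; the module path
# and class name below are hypothetical placeholders, not shipped code:
#
#   agent:
#     custom_async_server:
#       path: recipe.my_recipe.my_async_server
#       name: MyAsyncvLLMServer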
# Specifies the tensor bucket size (in megabytes) for batch weight updates during rollout operations.
# This parameter controls the maximum payload size for a single weight update request.
# Reference: https://github.com/volcengine/verl/pull/2418
# Currently only supported in SGLang rollout implementations.
# Larger values may improve throughput but increase memory overhead.
# Detailed performance comparison:
# https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/issues/169#issuecomment-3070686720
# The default value (512MB) is optimized for typical GPU memory configurations.
# For the best performance of `rebuild_cuda_tensor`, it is recommended to:
# 1. Enable `RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES`
# 2. Manually set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`
# when using Tensor Parallelism (TP) >= 8.
update_weights_bucket_megabytes: 512

# Trace rollout data.
trace:

  # Trace backend. Supported backends: mlflow, weave.
  backend: null

  # Whether to translate token ids to text in the trace output.
  token2text: False
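# An illustrative sketch (not a default) of enabling tracing, assuming an mlflow
# tracking backend is already configured in the environment:
#
#   trace:
#     backend: mlflow
#     token2text: True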