# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional

from omegaconf import MISSING

from verl.base_config import BaseConfig

__all__ = ["OptimizerConfig", "FSDPOptimizerConfig", "McoreOptimizerConfig"]


@dataclass
class OptimizerConfig(BaseConfig):
    """Base optimizer configuration.

    Args:
        lr (float): Learning rate. Must be specified.
        lr_warmup_steps_ratio (float): Warmup steps ratio; total steps will be injected at runtime.
        total_training_steps (int): Total training steps (must be overridden at runtime).
        weight_decay (float): Weight decay factor.
        lr_warmup_steps (Optional[int]): Number of warmup steps; None or a negative value delegates to
            lr_warmup_steps_ratio.
    """

    lr: float = MISSING
    lr_warmup_steps_ratio: float = 0.0
    total_training_steps: int = -1
    weight_decay: float = 0.01
    lr_warmup_steps: Optional[int] = -1

    def __post_init__(self):
        assert self.lr != MISSING, "OptimizerConfig.lr must be specified"


@dataclass
class FSDPOptimizerConfig(OptimizerConfig):
    """FSDP optimizer configuration extending the base OptimizerConfig.

    Args:
        lr (float): Learning rate.
        min_lr_ratio (Optional[float]): Minimum LR ratio for the cosine schedule.
        warmup_style (str): LR warmup style: "constant" or "cosine".
        num_cycles (float): Number of cosine cycles in the LR schedule.
    """

    min_lr_ratio: Optional[float] = None
    warmup_style: str = "constant"
    num_cycles: float = 0.5

    def __post_init__(self):
        assert self.warmup_style in ["constant", "cosine"], f"Invalid warmup_style: {self.warmup_style}"
        return super().__post_init__()


@dataclass
class McoreOptimizerConfig(OptimizerConfig):
    """Mcore optimizer configuration extending the base OptimizerConfig.

    Args:
        optimizer (str): Optimizer name; defaults to "adam".
        lr (float): Learning rate.
        clip_grad (float): Gradient clipping norm.
        lr_warmup_init (float): Initial learning rate for warmup; defaults to 0.0.
        lr_decay_steps (Optional[int]): Number of decay steps.
        lr_decay_style (str): LR decay style: "constant", "linear", "cosine", or "inverse_square_root".
        min_lr (float): Minimum learning rate.
        weight_decay_incr_style (str): Weight decay increment style: "constant" or "cosine".
        lr_wsd_decay_style (str): Decay style for the WSD (warmup-stable-decay) schedule:
            "constant", "exponential", or "cosine".
        lr_wsd_decay_steps (Optional[int]): Number of steps in the WSD decay phase.
        use_checkpoint_opt_param_scheduler (bool): Whether to use the checkpoint optimizer parameter scheduler.
    """

    optimizer: str = "adam"
    clip_grad: float = 1.0
    lr_warmup_init: float = 0.0
    lr_decay_steps: Optional[int] = None
    lr_decay_style: str = "linear"
    min_lr: float = 0.0
    weight_decay_incr_style: str = "constant"
    lr_wsd_decay_style: str = "exponential"
    lr_wsd_decay_steps: Optional[int] = None
    use_checkpoint_opt_param_scheduler: bool = False
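

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how these configs might be constructed
# directly, relying on __post_init__ validation. The learning-rate and
# schedule values below are arbitrary assumptions for demonstration, not
# recommended settings from any training recipe.
if __name__ == "__main__":
    # Base config: lr is mandatory; __post_init__ raises if it is left as MISSING.
    base_cfg = OptimizerConfig(lr=1e-6, weight_decay=0.01)

    # FSDP variant: warmup_style must be "constant" or "cosine".
    fsdp_cfg = FSDPOptimizerConfig(lr=1e-6, lr_warmup_steps_ratio=0.1, warmup_style="cosine")

    # Megatron-core variant: field names mirror Megatron's LR-scheduler arguments.
    mcore_cfg = McoreOptimizerConfig(lr=1e-6, lr_decay_style="cosine", min_lr=1e-7, clip_grad=1.0)

    print(base_cfg)
    print(fsdp_cfg)
    print(mcore_cfg)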