Update 0429

c7d1b209 · chenych · c8d12c06 · c7d1b209 · c7d1b209 · c7d1b209
Commit c7d1b209 authored Apr 29, 2025 by chenych
20 changed files
--- a/src/llamafactory/model/model_utils/moe.py
+++ b/src/llamafactory/model/model_utils/moe.py
@@ -12,21 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union
-import torch
 from transformers.integrations import is_deepspeed_zero3_enabled
 from ...extras.misc import check_version
 if TYPE_CHECKING:
+    from torch import nn
    from transformers import PretrainedConfig, PreTrainedModel
    from ...hparams import ModelArguments
-def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: list["torch.nn.Module"]) -> None:
+def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: list[Union["nn.Module", str]]) -> None:
    check_version("deepspeed>=0.13.0")
    from deepspeed.utils import set_z3_leaf_modules  # type: ignore
@@ -44,6 +44,19 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
        _set_z3_leaf_modules(model, [DbrxFFN])
+    if model_type == "deepseek_v2":
+        # deepseek v2 uses custom code
+        _set_z3_leaf_modules(model, ["DeepseekV2MoE"])
+    if model_type == "deepseek_v3" or model_type == "kimi_vl":
+        # deepseek v3 and kimi vl use custom code
+        _set_z3_leaf_modules(model, ["DeepseekV3MoE"])
+    if model_type == "granitemoe":
+        from transformers.models.granitemoe.modeling_granitemoe import GraniteMoeMoE
+        _set_z3_leaf_modules(model, [GraniteMoeMoE])
    if model_type == "jamba":
        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock
@@ -54,27 +67,55 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
        _set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE])
-    if model_type in ["kimi_vl", "deepseek_v3"]:
+    if model_type == "llama4":
-        check_version("transformers>=4.51.1")
+        from transformers.models.llama4.modeling_llama4 import Llama4TextMoe
-        from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE
-        _set_z3_leaf_modules(model, [DeepseekV3MoE])
+        _set_z3_leaf_modules(model, [Llama4TextMoe])
    if model_type == "mixtral":
        from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
        _set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
+    if model_type == "olmoe":
+        from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock
+        _set_z3_leaf_modules(model, [OlmoeSparseMoeBlock])
+    if model_type == "phimoe":
+        from transformers.models.phimoe.modeling_phimoe import PhimoeSparseMoeBlock
+        _set_z3_leaf_modules(model, [PhimoeSparseMoeBlock])
    if model_type == "qwen2_moe":
        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
        _set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])
+    if model_type == "qwen3_moe":
+        from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock
+        _set_z3_leaf_modules(model, [Qwen3MoeSparseMoeBlock])
 def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
    model_type = getattr(config, "model_type", None)
    if model_args.moe_aux_loss_coef is not None:
-        if model_type in ["jamba", "mixtral", "qwen2_moe"]:
+        if model_type in [
+            "dbrx",
+            "granitemoe",
+            "jamba",
+            "jetmoe",
+            "llama4",
+            "mixtral",
+            "olmoe",
+            "phimoe",
+            "qwen2_moe",
+            "qwen3_moe",
+        ]:
+            setattr(config, "output_router_logits", is_trainable)
+        if model_type in ["granitemoe", "jamba", "llama4", "mixtral", "olmoe", "phimoe", "qwen2_moe", "qwen3_moe"]:
            setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
        elif model_type == "deepseek":
@@ -82,6 +123,3 @@ def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_t
        elif model_type == "jetmoe":
            setattr(config, "aux_loss_coef", model_args.moe_aux_loss_coef)
-    if model_type in ["dbrx", "jamba", "jetmoe", "mixtral", "qwen2_moe"]:
-        setattr(config, "output_router_logits", is_trainable)
--- a/src/llamafactory/model/model_utils/packing.py
+++ b/src/llamafactory/model/model_utils/packing.py
@@ -43,12 +43,6 @@ import torch
 import torch.nn.functional as F
 from ...extras import logging
-from ...extras.misc import check_version
-from ...extras.packages import is_transformers_version_greater_than
-if is_transformers_version_greater_than("4.43.0"):
-    import transformers.modeling_flash_attention_utils
 if TYPE_CHECKING:
@@ -117,6 +111,7 @@ def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None:
    if not is_trainable or not model_args.block_diag_attn:
        return
-    check_version("transformers>=4.43.0")
+    import transformers.modeling_flash_attention_utils
    transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data
    logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.")
--- a/src/llamafactory/model/model_utils/quantization.py
+++ b/src/llamafactory/model/model_utils/quantization.py
@@ -122,9 +122,23 @@ def configure_quantization(
        if getattr(config, "model_type", None) == "chatglm":
            raise ValueError("ChatGLM model is not supported yet.")
+        try:
+            from optimum.gptq import utils as gq_utils
+            if "language_model.model.layers" not in gq_utils.BLOCK_PATTERNS:
+                gq_utils.BLOCK_PATTERNS.insert(0, "language_model.model.layers")
+        except ImportError:
+            pass
+        block_name_to_quantize = None
+        if getattr(config, "model_type", None) in ["gemma3", "paligemma"]:
+            block_name_to_quantize = "language_model.model.layers"
        init_kwargs["quantization_config"] = GPTQConfig(
            bits=model_args.export_quantization_bit,
+            tokenizer=tokenizer,
            dataset=_get_quantization_dataset(tokenizer, model_args),
+            block_name_to_quantize=block_name_to_quantize,
        )
        init_kwargs["device_map"] = "auto"
        init_kwargs["max_memory"] = get_max_memory()

--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -198,6 +198,11 @@ def patch_target_modules(
        return target_modules
+_register_composite_model(
+    model_type="internvl",
+)
 _register_composite_model(
    model_type="gemma3",
 )

--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -17,12 +17,12 @@ from typing import TYPE_CHECKING, Any
 import torch
 from peft import PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 from ..extras import logging
-from ..extras.misc import infer_optim_dtype, is_env_enabled
+from ..extras.misc import infer_optim_dtype
 from ..extras.packages import is_transformers_version_greater_than
 from .model_utils.attention import configure_attn_implementation, print_attn_implementation
 from .model_utils.checkpointing import prepare_model_for_training
@@ -54,16 +54,22 @@ def patch_tokenizer(tokenizer: "PreTrainedTokenizer", model_args: "ModelArgument
    if model_args.model_max_length is not None and tokenizer.model_max_length < model_args.model_max_length:
        tokenizer.model_max_length = model_args.model_max_length  # enlarge the tokenizer max length
-    if model_args.new_special_tokens is not None:
+    if model_args.add_tokens is not None:
-        num_added_tokens = tokenizer.add_special_tokens(
+        num_added_tokens = tokenizer.add_tokens(new_tokens=model_args.add_tokens, special_tokens=False)
-            dict(additional_special_tokens=model_args.new_special_tokens),
+        logger.info_rank0("Add tokens {} to tokenizer's vocabulary.".format(",".join(model_args.add_tokens)))
-            replace_additional_special_tokens=False,
-        )
-        logger.info_rank0("Add {} to special tokens.".format(",".join(model_args.new_special_tokens)))
        if num_added_tokens > 0 and not model_args.resize_vocab:
            model_args.resize_vocab = True
            logger.warning_rank0("New tokens have been added, changed `resize_vocab` to True.")
+    if model_args.add_special_tokens is not None:
+        num_added_special_tokens = tokenizer.add_tokens(new_tokens=model_args.add_special_tokens, special_tokens=True)
+        logger.info_rank0(
+            "Add special tokens {} to tokenizer's vocabulary.".format(",".join(model_args.add_special_tokens))
+        )
+        if num_added_special_tokens > 0 and not model_args.resize_vocab:
+            model_args.resize_vocab = True
+            logger.warning_rank0("New special tokens have been added, changed `resize_vocab` to True.")
 def patch_processor(
    processor: "ProcessorMixin",
@@ -74,6 +80,7 @@ def patch_processor(
    setattr(processor, "image_max_pixels", model_args.image_max_pixels)
    setattr(processor, "image_min_pixels", model_args.image_min_pixels)
    setattr(processor, "image_do_pan_and_scan", model_args.image_do_pan_and_scan)
+    setattr(processor, "crop_to_patches", model_args.crop_to_patches)
    setattr(processor, "video_max_pixels", model_args.video_max_pixels)
    setattr(processor, "video_min_pixels", model_args.video_min_pixels)
    setattr(processor, "video_fps", model_args.video_fps)
@@ -95,10 +102,6 @@ def patch_config(
        else:
            model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
-    if is_torch_npu_available():
-        # avoid JIT compile on NPU devices, see https://zhuanlan.zhihu.com/p/660875458
-        torch.npu.set_compile_mode(jit_compile=is_env_enabled("NPU_JIT_COMPILE"))
    configure_attn_implementation(config, model_args, is_trainable)
    configure_rope(config, model_args, is_trainable)
    configure_longlora(config, model_args, is_trainable)
@@ -121,6 +124,12 @@ def patch_config(
    if getattr(config, "model_type", None) == "kimi_vl" and is_trainable:
        setattr(config.text_config, "topk_method", "greedy")
+    if "InternVLChatModel" in getattr(config, "architectures", []):
+        raise ValueError(
+            "Please download the internvl models in a Hugging Face–compatible format "
+            "(for example, https://huggingface.co/OpenGVLab/InternVL3-8B-hf)."
+        )
    if "LlavaLlamaForCausalLM" in getattr(config, "architectures", []):
        raise ValueError("Please download llava models with hf-compatible format: https://huggingface.co/llava-hf")

--- a/src/llamafactory/third_party/__init__.py
+++ b/src/llamafactory/third_party/__init__.py
--- a/src/llamafactory/third_party/muon/__init__.py
+++ b/src/llamafactory/third_party/muon/__init__.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .muon import Muon
+__all__ = ["Muon"]
--- a/src/llamafactory/third_party/muon/muon.py
+++ b/src/llamafactory/third_party/muon/muon.py
+# Copyright 2025 Moonshot AI and the LlamaFactory team.
+#
+# This code is based on the MoonshotAI's Moonlight library.
+# https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
+# and the Keller Jordan's Muon library.
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2025 Moonshot AI
+# Copyright (c) 2024 Keller Jordan
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import math
+import torch
+def zeropower_via_newtonschulz5(G: "torch.Tensor", steps: int) -> "torch.Tensor":
+    """Newton-Schulz iteration to compute the zeroth power / orthogonalization of G.
+    We opt to use a quintic iteration whose coefficients are selected to maximize the slope at zero.
+    For the purpose of minimizing steps, it turns out to be empirically effective to keep increasing
+    the slope at zero even beyond the point where the iteration no longer converges all the way to
+    one everywhere on the interval. This iteration therefore does not produce UV^T but rather something
+    like US'V^T where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    Args:
+        G: A 2D tensor (asserted below). It is not modified; the work is done on a bfloat16 copy.
+        steps: The number of Newton-Schulz iterations to run.
+    Returns:
+        A bfloat16 tensor with the same shape as ``G`` holding the approximately orthogonalized matrix.
+    """
+    assert len(G.shape) == 2
+    # Coefficients of the quintic polynomial applied at every iteration
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    X = G.bfloat16()
+    # Work on the wide orientation so that the Gram matrix X @ X.T is built over the smaller dimension
+    if G.size(0) > G.size(1):
+        X = X.T
+    # Ensure spectral norm is at most 1 (the epsilon guards against division by zero for an all-zero input)
+    X = X / (X.norm() + 1e-7)
+    # Perform the NS iterations
+    for _ in range(steps):
+        A = X @ X.T
+        B = b * A + c * A @ A  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        X = a * X + B @ X
+    # Undo the transpose applied above so the result has the shape of the input
+    if G.size(0) > G.size(1):
+        X = X.T
+    return X
+class Muon(torch.optim.Optimizer):
+    """Muon - MomentUm Orthogonalized by Newton-schulz.
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        lr: The learning rate shared by the Muon and the AdamW updates. For Muon parameters it is
+            rescaled per matrix by `adjust_lr_for_muon` before the update is applied.
+        wd: The decoupled weight decay coefficient, applied with the unscaled `lr` by both update rules.
+        muon_params: The parameters to be optimized by Muon. Every one of them must be 2D.
+            Any iterable is accepted (including a generator); `None` means no Muon parameters.
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run.
+        adamw_params: The parameters to be optimized by the internal AdamW instead of Muon
+            (for example {0, 1}-D parameters, embeddings and the lm_head). `None` means none.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+    """
+    def __init__(
+        self,
+        lr=1e-3,
+        wd=0.1,
+        muon_params=None,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_params=None,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        # Materialize both collections exactly once: they are walked again below to tag each
+        # parameter, so a generator would otherwise be exhausted and leave the tags unset.
+        muon_params = list(muon_params) if muon_params is not None else []
+        adamw_params = list(adamw_params) if adamw_params is not None else []
+        super().__init__(muon_params + adamw_params, defaults)
+        # Tag every parameter with the update rule that `step` must apply to it
+        for p in muon_params:
+            # Muon orthogonalizes matrices, so only 2D parameters are accepted here
+            assert p.ndim == 2, p.ndim
+            self.state[p]["use_muon"] = True
+        for p in adamw_params:
+            # Do not use Muon for parameters in adamw_params
+            self.state[p]["use_muon"] = False
+    def adjust_lr_for_muon(self, lr: float, param_shape: list[int]) -> float:
+        """Return `lr` scaled by ``0.2 * sqrt(max(A, B))`` for a parameter whose first two dims are A and B."""
+        A, B = param_shape[:2]
+        # We adjust the learning rate based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The value returned by `closure`, or `None` when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            # Muon loop
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            # generate weight updates in distributed fashion
+            for p in params:
+                # parameters that received no gradient in this step are left untouched
+                g = p.grad
+                if g is None:
+                    continue
+                if g.ndim > 2:
+                    g = g.view(g.size(0), -1)
+                # calc update
+                state = self.state[p]
+                if "momentum_buffer" not in state:
+                    state["momentum_buffer"] = torch.zeros_like(g)
+                buf = state["momentum_buffer"]
+                buf.mul_(momentum).add_(g)
+                if group["nesterov"]:
+                    g = g.add(buf, alpha=momentum)
+                else:
+                    g = buf
+                u = zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+                # scale update
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                # apply weight decay (decoupled; uses the unscaled learning rate)
+                p.data.mul_(1 - lr * wd)
+                # apply update
+                p.data.add_(u, alpha=-adjusted_lr)
+            # Adam backup
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            lr = group["lr"]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                # exponential moving averages of the gradient and of its square
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # fold both bias corrections into a single scale applied to the step size
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss
--- a/src/llamafactory/train/callbacks.py
+++ b/src/llamafactory/train/callbacks.py
@@ -188,7 +188,7 @@ class LogCallback(TrainerCallback):
        self.webui_mode = is_env_enabled("LLAMABOARD_ENABLED")
        if self.webui_mode and not use_ray():
            signal.signal(signal.SIGABRT, self._set_abort)
-            self.logger_handler = logging.LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
+            self.logger_handler = logging.LoggerHandler(os.getenv("LLAMABOARD_WORKDIR"))
            logging.add_handler(self.logger_handler)
            transformers.logging.add_handler(self.logger_handler)

--- a/src/llamafactory/train/dpo/workflow.py
+++ b/src/llamafactory/train/dpo/workflow.py
@@ -63,9 +63,6 @@ def run_dpo(
    else:
        ref_model = None
-    # Update arguments
-    training_args.remove_unused_columns = False  # important for multimodal and pairwise dataset
    # Initialize our Trainer
    trainer = CustomDPOTrainer(
        model=model,

--- a/src/llamafactory/train/kto/workflow.py
+++ b/src/llamafactory/train/kto/workflow.py
@@ -59,9 +59,6 @@ def run_kto(
    else:
        ref_model = create_ref_model(model_args, finetuning_args)
-    # Update arguments
-    training_args.remove_unused_columns = False  # important for multimodal and pairwise dataset
    # Initialize our Trainer
    trainer = CustomKTOTrainer(
        model=model,

--- a/src/llamafactory/train/pt/trainer.py
+++ b/src/llamafactory/train/pt/trainer.py
@@ -40,6 +40,11 @@ class CustomTrainer(Trainer):
            kwargs["processing_class"] = kwargs.pop("tokenizer")
        super().__init__(**kwargs)
+        if processor is not None:
+            # avoid wrong loss under gradient accumulation
+            # https://github.com/huggingface/transformers/pull/36044#issuecomment-2746657112
+            self.model_accepts_loss_kwargs = False
        self.finetuning_args = finetuning_args
        if processor is not None:

--- a/src/llamafactory/train/rm/workflow.py
+++ b/src/llamafactory/train/rm/workflow.py
@@ -48,9 +48,6 @@ def run_rm(
        template=template, model=model, pad_to_multiple_of=8, **tokenizer_module
    )
-    # Update arguments
-    training_args.remove_unused_columns = False  # important for multimodal and pairwise dataset
    # Initialize our Trainer
    trainer = PairwiseTrainer(
        model=model,

--- a/src/llamafactory/train/sft/trainer.py
+++ b/src/llamafactory/train/sft/trainer.py
@@ -60,6 +60,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
        super().__init__(**kwargs)
        if processor is not None:
+            # avoid wrong loss under gradient accumulation
+            # https://github.com/huggingface/transformers/pull/36044#issuecomment-2746657112
            self.model_accepts_loss_kwargs = False
        self.finetuning_args = finetuning_args

--- a/src/llamafactory/train/sft/workflow.py
+++ b/src/llamafactory/train/sft/workflow.py
@@ -20,7 +20,7 @@ from typing import TYPE_CHECKING, Optional
 from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
-from ...extras.misc import calculate_tps, get_logits_processor
+from ...extras.misc import calculate_tps
 from ...extras.ploting import plot_loss
 from ...model import load_model, load_tokenizer
 from ..trainer_utils import create_modelcard_and_push
@@ -65,11 +65,6 @@ def run_sft(
        **tokenizer_module,
    )
-    # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
-    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
-    training_args.remove_unused_columns = False  # important for multimodal dataset
    # Metric utils
    metric_module = {}
    if training_args.predict_with_generate:
@@ -82,7 +77,6 @@ def run_sft(
    gen_kwargs = generating_args.to_dict(obey_generation_config=True)
    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
-    gen_kwargs["logits_processor"] = get_logits_processor()
    # Initialize our Trainer
    trainer = CustomSeq2SeqTrainer(

--- a/src/llamafactory/train/trainer_utils.py
+++ b/src/llamafactory/train/trainer_utils.py
@@ -490,6 +490,35 @@ def _create_adam_mini_optimizer(
    return optimizer
+def _create_muon_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+) -> "torch.optim.Optimizer":
+    """Build a Muon optimizer over the trainable parameters of `model`.
+    Trainable 2D parameters whose name contains neither "embed" nor "lm_head" are handed to Muon;
+    every other trainable parameter is handed to the optimizer's internal AdamW.
+    """
+    from ..third_party.muon import Muon
+    muon_params, adamw_params = [], []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen parameters are not optimized at all
+        # Embeddings and output heads are 2D as well, but they are excluded by name
+        is_hidden_matrix = param.ndim == 2 and "embed" not in name and "lm_head" not in name
+        target = muon_params if is_hidden_matrix else adamw_params
+        target.append(param)
+    betas = (training_args.adam_beta1, training_args.adam_beta2)
+    optimizer = Muon(
+        lr=training_args.learning_rate,
+        wd=training_args.weight_decay,
+        muon_params=muon_params,
+        adamw_params=adamw_params,
+        adamw_betas=betas,
+        adamw_eps=training_args.adam_epsilon,
+    )
+    logger.info_rank0(
+        f"Using Muon optimizer with {len(muon_params)} Muon params and {len(adamw_params)} AdamW params."
+    )
+    return optimizer
 def create_custom_optimizer(
    model: "PreTrainedModel",
    training_args: "TrainingArguments",
@@ -510,6 +539,9 @@ def create_custom_optimizer(
    if finetuning_args.use_adam_mini:
        return _create_adam_mini_optimizer(model, training_args)
+    if finetuning_args.use_muon:
+        return _create_muon_optimizer(model, training_args)
 def create_custom_scheduler(
    training_args: "TrainingArguments",
@@ -648,6 +680,12 @@ def get_ray_trainer(
    if ray_args.ray_init_kwargs is not None:
        ray.init(**ray_args.ray_init_kwargs)
+    if ray_args.ray_storage_filesystem is not None:
+        # this means we are using s3/gcs
+        storage_path = ray_args.ray_storage_path
+    else:
+        storage_path = Path(ray_args.ray_storage_path).absolute().as_posix()
    trainer = TorchTrainer(
        training_function,
        train_loop_config=train_loop_config,
@@ -659,7 +697,8 @@ def get_ray_trainer(
        ),
        run_config=RunConfig(
            name=ray_args.ray_run_name,
-            storage_path=Path(ray_args.ray_storage_path).absolute().as_posix(),
+            storage_filesystem=ray_args.ray_storage_filesystem,
+            storage_path=storage_path,
        ),
    )
    return trainer
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Any, Optional
 import torch
 import torch.distributed as dist
-from transformers import PreTrainedModel
+from transformers import EarlyStoppingCallback, PreTrainedModel
 from ..data import get_template_and_fix_tokenizer
 from ..extras import logging
@@ -61,6 +61,9 @@ def _training_function(config: dict[str, Any]) -> None:
    if finetuning_args.use_swanlab:
        callbacks.append(get_swanlab_callback(finetuning_args))
+    if finetuning_args.early_stopping_steps is not None:
+        callbacks.append(EarlyStoppingCallback(early_stopping_patience=finetuning_args.early_stopping_steps))
    callbacks.append(ReporterCallback(model_args, data_args, finetuning_args, generating_args))  # add to last
    if finetuning_args.stage == "pt":

--- a/src/llamafactory/webui/chatter.py
+++ b/src/llamafactory/webui/chatter.py
@@ -77,10 +77,10 @@ class WebChatModel(ChatModel):
        if not lazy_init:  # read arguments from command line
            super().__init__()
-        if demo_mode and os.environ.get("DEMO_MODEL") and os.environ.get("DEMO_TEMPLATE"):  # load demo model
+        if demo_mode and os.getenv("DEMO_MODEL") and os.getenv("DEMO_TEMPLATE"):  # load demo model
-            model_name_or_path = os.environ.get("DEMO_MODEL")
+            model_name_or_path = os.getenv("DEMO_MODEL")
-            template = os.environ.get("DEMO_TEMPLATE")
+            template = os.getenv("DEMO_TEMPLATE")
-            infer_backend = os.environ.get("DEMO_BACKEND", "huggingface")
+            infer_backend = os.getenv("DEMO_BACKEND", "huggingface")
            super().__init__(
                dict(model_name_or_path=model_name_or_path, template=template, infer_backend=infer_backend)
            )

--- a/src/llamafactory/webui/control.py
+++ b/src/llamafactory/webui/control.py
@@ -56,11 +56,11 @@ def can_quantize_to(quantization_method: str) -> "gr.Dropdown":
    Inputs: top.quantization_method
    Outputs: top.quantization_bit
    """
-    if quantization_method == QuantizationMethod.BITS_AND_BYTES.value:
+    if quantization_method == QuantizationMethod.BNB:
        available_bits = ["none", "8", "4"]
-    elif quantization_method == QuantizationMethod.HQQ.value:
+    elif quantization_method == QuantizationMethod.HQQ:
        available_bits = ["none", "8", "6", "5", "4", "3", "2", "1"]
-    elif quantization_method == QuantizationMethod.EETQ.value:
+    elif quantization_method == QuantizationMethod.EETQ:
        available_bits = ["none", "8"]
    return gr.Dropdown(choices=available_bits)

--- a/src/llamafactory/webui/runner.py
+++ b/src/llamafactory/webui/runner.py
@@ -23,7 +23,7 @@ from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.utils import is_torch_npu_available
 from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES
-from ..extras.misc import is_gpu_or_npu_available, torch_gc, use_ray
+from ..extras.misc import is_accelerator_available, torch_gc, use_ray
 from ..extras.packages import is_gradio_available
 from .common import (
    DEFAULT_CACHE_DIR,
@@ -108,7 +108,7 @@ class Runner:
            if not get("eval.output_dir"):
                return ALERTS["err_no_output_dir"][lang]
-        if not from_preview and not is_gpu_or_npu_available():
+        if not from_preview and not is_accelerator_available():
            gr.Warning(ALERTS["warn_no_cuda"][lang])
        return ""