Add QWQ-32B

317a82e2 · chenych · 37b0ad9f
Commit 317a82e2 authored Mar 07, 2025 by chenych
20 changed files
--- a/scripts/api_example/test_image.py
+++ b/scripts/api_example/test_image.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/scripts/api_example/test_toolcall.py
+++ b/scripts/api_example/test_toolcall.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/scripts/cal_flops.py
+++ b/scripts/cal_flops.py
-# Copyright 2024 Microsoft Corporation and the LlamaFactory team.
-#
-# This code is inspired by the Microsoft's DeepSpeed library.
-# https://www.deepspeed.ai/tutorials/flops-profiler/
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import fire
-import torch
-from deepspeed.accelerator import get_accelerator  # type: ignore
-from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore
-from llamafactory.chat import ChatModel
-def calculate_flops(
-    model_name_or_path: str,
-    batch_size: int = 1,
-    seq_length: int = 512,
-    flash_attn: str = "auto",
-):
-    r"""
-    Calculates the flops of pre-trained models.
-    Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
-    """
-    with get_accelerator().device(0):
-        chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn))
-        fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.engine.model.device)
-        input_dict = {"input_ids": fake_input, "labels": fake_input.clone()}
-        flops, macs, params = get_model_profile(
-            chat_model.engine.model, kwargs=input_dict, print_profile=True, detailed=True
-        )
-        print("FLOPs:", flops)
-        print("MACs:", macs)
-        print("Params:", params)
-if __name__ == "__main__":
-    fire.Fire(calculate_flops)
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
-# Copyright 2024 imoneoi and the LlamaFactory team.
-#
-# This code is inspired by the imoneoi's OpenChat library.
-# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-from typing import Literal
-import fire
-import torch
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
-from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
-from llamafactory.extras.constants import IGNORE_INDEX
-from llamafactory.hparams import get_train_args
-from llamafactory.model import load_tokenizer
-BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
-BASE_BS = 4_000_000  # from llama paper
-def calculate_lr(
-    model_name_or_path: str,
-    batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
-    stage: Literal["pt", "sft"] = "sft",
-    dataset: str = "alpaca_en_demo",
-    dataset_dir: str = "data",
-    template: str = "default",
-    cutoff_len: int = 1024,  # i.e. maximum input length during training
-    is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate,
-    packing: bool = False,
-):
-    r"""
-    Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
-    Usage:
-    python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en_demo --cutoff_len 1024 --batch_size 16
-    """
-    model_args, data_args, training_args, _, _ = get_train_args(
-        dict(
-            stage=stage,
-            model_name_or_path=model_name_or_path,
-            dataset=dataset,
-            dataset_dir=dataset_dir,
-            template=template,
-            cutoff_len=cutoff_len,
-            packing=packing,
-            output_dir="dummy_dir",
-            overwrite_cache=True,
-            do_train=True,
-        )
-    )
-    tokenizer_module = load_tokenizer(model_args)
-    tokenizer = tokenizer_module["tokenizer"]
-    template = get_template_and_fix_tokenizer(tokenizer, data_args)
-    trainset = get_dataset(template, model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"]
-    if stage == "pt":
-        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-    elif stage == "sft":
-        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
-    else:
-        raise NotImplementedError(f"Stage does not supported: {stage}.")
-    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
-    valid_tokens, total_tokens = 0, 0
-    for batch in tqdm(dataloader):
-        valid_tokens += torch.sum(batch["labels"] != IGNORE_INDEX).item()
-        total_tokens += torch.numel(batch["labels"])
-    batch_max_len = cutoff_len * batch_size  # max tokens in a batch
-    valid_ratio = valid_tokens / total_tokens
-    batch_valid_len = batch_max_len * valid_ratio
-    lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)  # lr ~ sqrt(batch_size)
-    lr = lr / 6.0 if is_mistral_or_gemma else lr
-    print(
-        "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
-            lr, valid_ratio * 100, batch_valid_len
-        )
-    )
-if __name__ == "__main__":
-    fire.Fire(calculate_lr)
--- a/scripts/cal_mfu.py
+++ b/scripts/cal_mfu.py
-# Copyright 2024 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-import fire
-import torch
-import torch.distributed as dist
-from transformers import AutoConfig
-from llamafactory.train.tuner import run_exp
-BASE = 2  # gemm (add + mul)
-def compute_model_flops(
-    model_name_or_path: str,
-    total_batch_size: int,
-    seq_length: int,
-    include_backward: bool = True,
-    include_recompute: bool = False,
-    include_flashattn: bool = False,
-) -> int:
-    r"""
-    Calculates the FLOPs of model per forward/backward pass.
-    """
-    config = AutoConfig.from_pretrained(model_name_or_path)
-    hidden_size = getattr(config, "hidden_size", None)
-    vocab_size = getattr(config, "vocab_size", None)
-    intermediate_size = getattr(config, "intermediate_size", None)
-    num_attention_heads = getattr(config, "num_attention_heads", None)
-    num_key_value_heads = getattr(config, "num_key_value_heads", None)
-    num_hidden_layers = getattr(config, "num_hidden_layers", None)
-    tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
-    # mlp module
-    mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size  # up, gate, down
-    mlp_flops = total_batch_size * seq_length * num_hidden_layers * mlp_flops_per_token
-    # attn projector module
-    q_flops_per_token = BASE * hidden_size * hidden_size
-    o_flops_per_token = BASE * hidden_size * hidden_size
-    k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
-    v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
-    attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token
-    attn_proj_flops = total_batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token
-    # attn sdpa module
-    sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length  # (q * k^T) * v
-    sdpa_flops = total_batch_size * num_hidden_layers * sdpa_flops_per_layer
-    # embedding module
-    embedding_flops_per_token = hidden_size * vocab_size
-    embedding_flops = total_batch_size * seq_length * embedding_flops_per_token
-    if tie_word_embeddings is False:
-        embedding_flops *= 2
-    non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops
-    non_embedding_coeff, embedding_coeff = 1, 1
-    if include_backward:
-        non_embedding_coeff += 2
-        embedding_coeff += 2
-    if include_recompute:
-        non_embedding_coeff += 1
-    total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops
-    if include_flashattn:
-        total_flops += sdpa_flops
-    return total_flops
-def compute_device_flops(world_size: int) -> float:
-    r"""
-    Calculates the FLOPs of the device capability per second.
-    """
-    device_name = torch.cuda.get_device_name()
-    if "H100" in device_name or "H800" in device_name:
-        return 989 * 1e12 * world_size
-    elif "A100" in device_name or "A800" in device_name:
-        return 312 * 1e12 * world_size
-    elif "V100" in device_name:
-        return 125 * 1e12 * world_size
-    elif "4090" in device_name:
-        return 98 * 1e12 * world_size
-    else:
-        raise NotImplementedError(f"Device not supported: {device_name}.")
-def calculate_mfu(
-    model_name_or_path: str,
-    batch_size: int = 1,
-    seq_length: int = 1024,
-    num_steps: int = 100,
-    finetuning_type: str = "lora",
-    flash_attn: str = "auto",
-    deepspeed_stage: int = 0,
-    disable_gc: bool = False,
-    liger_kernel: bool = False,
-    unsloth_gc: bool = False,
-) -> float:
-    r"""
-    Calculates MFU for given model and hyper-params.
-    Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
-    """
-    args = {
-        "model_name_or_path": model_name_or_path,
-        "flash_attn": flash_attn,
-        "disable_gradient_checkpointing": disable_gc,
-        "enable_liger_kernel": liger_kernel,
-        "use_unsloth_gc": unsloth_gc,
-        "stage": "pt",
-        "do_train": True,
-        "finetuning_type": finetuning_type,
-        "dataset": "c4_demo",
-        "cutoff_len": seq_length,
-        "output_dir": os.path.join("saves", "test_mfu"),
-        "logging_strategy": "no",
-        "save_strategy": "no",
-        "save_only_model": True,
-        "overwrite_output_dir": True,
-        "per_device_train_batch_size": batch_size,
-        "max_steps": num_steps,
-        "bf16": True,
-    }
-    if deepspeed_stage in [2, 3]:
-        args["deepspeed"] = f"examples/deepspeed/ds_z{deepspeed_stage}_config.json"
-    run_exp(args)
-    with open(os.path.join("saves", "test_mfu", "all_results.json"), encoding="utf-8") as f:
-        result = json.load(f)
-    if dist.is_initialized():
-        world_size = dist.get_world_size()
-    else:
-        world_size = 1
-    total_batch_size = batch_size * world_size
-    mfu_value = (
-        result["train_steps_per_second"]
-        * compute_model_flops(model_name_or_path, total_batch_size, seq_length)
-        / compute_device_flops(world_size)
-    )
-    print(f"MFU: {mfu_value * 100:.2f}%")
-if __name__ == "__main__":
-    fire.Fire(calculate_mfu)
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
-# Copyright 2024 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-from dataclasses import dataclass
-from typing import Any, Dict, Literal, Optional, Sequence
-import fire
-import torch
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
-from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
-from llamafactory.extras.constants import IGNORE_INDEX
-from llamafactory.hparams import get_train_args
-from llamafactory.model import load_model, load_tokenizer
-@dataclass
-class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
-    r"""
-    Data collator for pairwise data.
-    """
-    train_on_prompt: bool = False
-    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
-        r"""
-        Pads batched data to the longest sequence in the batch.
-        We generate 2 * n examples where the first n examples represent chosen examples and
-        the last n examples represent rejected examples.
-        """
-        chosen_features = []
-        for feature in features:
-            prompt_len, answer_len = len(feature["prompt_ids"]), len(feature["chosen_ids"])
-            input_ids = feature["prompt_ids"] + feature["chosen_ids"]
-            attention_mask = [1] * (prompt_len + answer_len)
-            labels = input_ids if self.train_on_prompt else [IGNORE_INDEX] * prompt_len + feature["chosen_ids"]
-            chosen_features.append({"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels})
-        return super().__call__(chosen_features)
-def calculate_ppl(
-    model_name_or_path: str,
-    save_name: str,
-    batch_size: int = 4,
-    stage: Literal["pt", "sft", "rm"] = "sft",
-    dataset: str = "alpaca_en_demo",
-    dataset_dir: str = "data",
-    template: str = "default",
-    cutoff_len: int = 1024,
-    max_samples: Optional[int] = None,
-    train_on_prompt: bool = False,
-):
-    r"""
-    Calculates the ppl on the dataset of the pre-trained models.
-    Usage: python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
-    """
-    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
-        dict(
-            stage=stage,
-            model_name_or_path=model_name_or_path,
-            dataset=dataset,
-            dataset_dir=dataset_dir,
-            template=template,
-            cutoff_len=cutoff_len,
-            max_samples=max_samples,
-            train_on_prompt=train_on_prompt,
-            output_dir="dummy_dir",
-            overwrite_cache=True,
-            do_train=True,
-        )
-    )
-    tokenizer_module = load_tokenizer(model_args)
-    tokenizer = tokenizer_module["tokenizer"]
-    template = get_template_and_fix_tokenizer(tokenizer, data_args)
-    trainset = get_dataset(template, model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"]
-    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
-    if stage == "pt":
-        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-    elif stage == "sft":
-        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
-    elif stage == "rm":
-        data_collator = PairwiseDataCollatorWithPadding(
-            tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
-        )
-    else:
-        raise NotImplementedError(f"Stage does not supported: {stage}.")
-    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
-    criterion = torch.nn.CrossEntropyLoss(reduction="none")
-    total_ppl = 0
-    perplexities = []
-    batch: Dict[str, "torch.Tensor"]
-    with torch.no_grad():
-        for batch in tqdm(dataloader):
-            batch = batch.to(model.device)
-            outputs = model(**batch)
-            shift_logits: "torch.Tensor" = outputs["logits"][..., :-1, :]
-            shift_labels: "torch.Tensor" = batch["labels"][..., 1:]
-            loss_mask = shift_labels != IGNORE_INDEX
-            flatten_logits = shift_logits.contiguous().view(shift_labels.size(0) * shift_labels.size(1), -1)
-            flatten_labels = shift_labels.contiguous().view(-1)
-            token_logps: "torch.Tensor" = criterion(flatten_logits, flatten_labels)
-            token_logps = token_logps.contiguous().view(shift_logits.size(0), -1)
-            sentence_logps = (token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
-            total_ppl += sentence_logps.exp().sum().item()
-            perplexities.extend(sentence_logps.exp().tolist())
-    with open(save_name, "w", encoding="utf-8") as f:
-        json.dump(perplexities, f, indent=2)
-    print(f"Average perplexity is {total_ppl / len(perplexities):.2f}")
-    print(f"Perplexities have been saved at {save_name}.")
-if __name__ == "__main__":
-    fire.Fire(calculate_ppl)
--- a/scripts/convert_ckpt/llamafy_baichuan2.py
+++ b/scripts/convert_ckpt/llamafy_baichuan2.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,15 +19,10 @@ from typing import Any, Dict
 import fire
 import torch
+from huggingface_hub import split_torch_state_dict_into_shards
 from safetensors.torch import save_file
 from tqdm import tqdm
-from transformers.modeling_utils import (
+from transformers.modeling_utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
-    shard_checkpoint,
-)
 CONFIG_NAME = "config.json"
@@ -40,34 +35,42 @@ def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetenso
            shard_weight = torch.load(os.path.join(input_dir, filepath), map_location="cpu")
            baichuan2_state_dict.update(shard_weight)
-    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    llama_state_dict: Dict[str, torch.Tensor] = OrderedDict()
    for key, value in tqdm(baichuan2_state_dict.items(), desc="Convert format"):
        if "W_pack" in key:
            proj_size = value.size(0) // 3
-            llama2_state_dict[key.replace("W_pack", "q_proj")] = value[:proj_size, :]
+            llama_state_dict[key.replace("W_pack", "q_proj")] = value[:proj_size, :]
-            llama2_state_dict[key.replace("W_pack", "k_proj")] = value[proj_size : 2 * proj_size, :]
+            llama_state_dict[key.replace("W_pack", "k_proj")] = value[proj_size : 2 * proj_size, :]
-            llama2_state_dict[key.replace("W_pack", "v_proj")] = value[2 * proj_size :, :]
+            llama_state_dict[key.replace("W_pack", "v_proj")] = value[2 * proj_size :, :]
        elif "lm_head" in key:
-            llama2_state_dict[key] = torch.nn.functional.normalize(value)
+            llama_state_dict[key] = torch.nn.functional.normalize(value)
        else:
-            llama2_state_dict[key] = value
+            llama_state_dict[key] = value
    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
-    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
+    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+    state_dict_split = split_torch_state_dict_into_shards(
-    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
+        llama_state_dict, filename_pattern=filename_pattern, max_shard_size=shard_size
+    )
+    for shard_file, tensors in tqdm(state_dict_split.filename_to_tensors.items(), desc="Save weights"):
+        shard = {tensor: llama_state_dict[tensor].contiguous() for tensor in tensors}
        if save_safetensors:
            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
        else:
            torch.save(shard, os.path.join(output_dir, shard_file))
-    if index is None:
+    if not state_dict_split.is_sharded:
-        print(f"Model weights saved in {os.path.join(output_dir, WEIGHTS_NAME)}")
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
    else:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, sort_keys=True)
-        print(f"Model weights saved in {output_dir}")
+        print(f"Model weights saved in {output_dir}.")
 def save_config(input_dir: str, output_dir: str):
@@ -81,6 +84,7 @@ def save_config(input_dir: str, output_dir: str):
    with open(os.path.join(output_dir, CONFIG_NAME), "w", encoding="utf-8") as f:
        json.dump(llama2_config_dict, f, indent=2)
    print(f"Model config saved in {os.path.join(output_dir, CONFIG_NAME)}")

--- a/scripts/convert_ckpt/llamafy_qwen.py
+++ b/scripts/convert_ckpt/llamafy_qwen.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,16 +19,11 @@ from typing import Any, Dict
 import fire
 import torch
+from huggingface_hub import split_torch_state_dict_into_shards
 from safetensors import safe_open
 from safetensors.torch import save_file
 from tqdm import tqdm
-from transformers.modeling_utils import (
+from transformers.modeling_utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
-    shard_checkpoint,
-)
 from transformers.utils import check_min_version
@@ -49,60 +44,68 @@ def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetenso
                for key in f.keys():
                    qwen_state_dict[key] = f.get_tensor(key)
-    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    llama_state_dict: Dict[str, torch.Tensor] = OrderedDict()
    torch_dtype = None
    for key, value in tqdm(qwen_state_dict.items(), desc="Convert format"):
        if torch_dtype is None:
            torch_dtype = value.dtype
        if "wte" in key:
-            llama2_state_dict["model.embed_tokens.weight"] = value
+            llama_state_dict["model.embed_tokens.weight"] = value
        elif "ln_f" in key:
-            llama2_state_dict["model.norm.weight"] = value
+            llama_state_dict["model.norm.weight"] = value
        else:
            key = key.replace("transformer.h", "model.layers")
            if "attn.c_attn" in key:
                proj_size = value.size(0) // 3
-                llama2_state_dict[key.replace("attn.c_attn", "self_attn.q_proj")] = value[:proj_size, ...]
+                llama_state_dict[key.replace("attn.c_attn", "self_attn.q_proj")] = value[:proj_size, ...]
-                llama2_state_dict[key.replace("attn.c_attn", "self_attn.k_proj")] = value[
+                llama_state_dict[key.replace("attn.c_attn", "self_attn.k_proj")] = value[
                    proj_size : 2 * proj_size, ...
                ]
-                llama2_state_dict[key.replace("attn.c_attn", "self_attn.v_proj")] = value[2 * proj_size :, ...]
+                llama_state_dict[key.replace("attn.c_attn", "self_attn.v_proj")] = value[2 * proj_size :, ...]
            elif "attn.c_proj" in key:
-                llama2_state_dict[key.replace("attn.c_proj", "self_attn.o_proj")] = value
+                llama_state_dict[key.replace("attn.c_proj", "self_attn.o_proj")] = value
-                llama2_state_dict[key.replace("attn.c_proj.weight", "self_attn.o_proj.bias")] = torch.zeros_like(
+                llama_state_dict[key.replace("attn.c_proj.weight", "self_attn.o_proj.bias")] = torch.zeros_like(
                    value[:, 0]
                ).squeeze()
            elif "ln_1" in key:
-                llama2_state_dict[key.replace("ln_1", "input_layernorm")] = value
+                llama_state_dict[key.replace("ln_1", "input_layernorm")] = value
            elif "ln_2" in key:
-                llama2_state_dict[key.replace("ln_2", "post_attention_layernorm")] = value
+                llama_state_dict[key.replace("ln_2", "post_attention_layernorm")] = value
            elif "mlp.w1" in key:
-                llama2_state_dict[key.replace("mlp.w1", "mlp.up_proj")] = value
+                llama_state_dict[key.replace("mlp.w1", "mlp.up_proj")] = value
            elif "mlp.w2" in key:
-                llama2_state_dict[key.replace("mlp.w2", "mlp.gate_proj")] = value
+                llama_state_dict[key.replace("mlp.w2", "mlp.gate_proj")] = value
            elif "mlp.c_proj" in key:
-                llama2_state_dict[key.replace("mlp.c_proj", "mlp.down_proj")] = value
+                llama_state_dict[key.replace("mlp.c_proj", "mlp.down_proj")] = value
            elif "lm_head" in key:
-                llama2_state_dict[key] = value
+                llama_state_dict[key] = value
            else:
                raise KeyError(f"Unable to process key {key}")
    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
-    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
+    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+    state_dict_split = split_torch_state_dict_into_shards(
-    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
+        llama_state_dict, filename_pattern=filename_pattern, max_shard_size=shard_size
+    )
+    for shard_file, tensors in tqdm(state_dict_split.filename_to_tensors.items(), desc="Save weights"):
+        shard = {tensor: llama_state_dict[tensor].contiguous() for tensor in tensors}
        if save_safetensors:
            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
        else:
            torch.save(shard, os.path.join(output_dir, shard_file))
-    if index is None:
+    if not state_dict_split.is_sharded:
-        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}")
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
    else:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, sort_keys=True)
-        print(f"Model weights saved in {output_dir}")
+        print(f"Model weights saved in {output_dir}.")
    return str(torch_dtype).replace("torch.", "")
@@ -134,6 +137,7 @@ def save_config(input_dir: str, output_dir: str, torch_dtype: str):
    with open(os.path.join(output_dir, CONFIG_NAME), "w", encoding="utf-8") as f:
        json.dump(llama2_config_dict, f, indent=2)
    print(f"Model config saved in {os.path.join(output_dir, CONFIG_NAME)}")

--- a/scripts/length_cdf.py
+++ b/scripts/length_cdf.py
-# Copyright 2024 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from collections import defaultdict
-import fire
-from tqdm import tqdm
-from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
-from llamafactory.hparams import get_train_args
-from llamafactory.model import load_tokenizer
-def length_cdf(
-    model_name_or_path: str,
-    dataset: str = "alpaca_en_demo",
-    dataset_dir: str = "data",
-    template: str = "default",
-    interval: int = 1000,
-):
-    r"""
-    Calculates the distribution of the input lengths in the dataset.
-    Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
-    """
-    model_args, data_args, training_args, _, _ = get_train_args(
-        dict(
-            stage="sft",
-            model_name_or_path=model_name_or_path,
-            dataset=dataset,
-            dataset_dir=dataset_dir,
-            template=template,
-            cutoff_len=1_000_000,
-            output_dir="dummy_dir",
-            overwrite_cache=True,
-            do_train=True,
-        )
-    )
-    tokenizer_module = load_tokenizer(model_args)
-    template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
-    trainset = get_dataset(template, model_args, data_args, training_args, "sft", **tokenizer_module)["train_dataset"]
-    total_num = len(trainset)
-    length_dict = defaultdict(int)
-    for sample in tqdm(trainset["input_ids"]):
-        length_dict[len(sample) // interval * interval] += 1
-    length_tuples = list(length_dict.items())
-    length_tuples.sort()
-    count_accu, prob_accu = 0, 0
-    for length, count in length_tuples:
-        count_accu += count
-        prob_accu += count / total_num * 100
-        print(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {length + interval}.")
-if __name__ == "__main__":
-    fire.Fire(length_cdf)
--- a/scripts/llama_pro.py
+++ b/scripts/llama_pro.py
-# Copyright 2024 Tencent Inc. and the LlamaFactory team.
+# Copyright 2025 Tencent Inc. and the LlamaFactory team.
 #
 # This code is inspired by the Tencent's LLaMA-Pro library.
 # https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py
@@ -18,20 +18,15 @@
 import json
 import os
 from collections import OrderedDict
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict
 import fire
 import torch
+from huggingface_hub import split_torch_state_dict_into_shards
 from safetensors.torch import save_file
 from tqdm import tqdm
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
-from transformers.modeling_utils import (
+from transformers.modeling_utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
-    shard_checkpoint,
-)
 if TYPE_CHECKING:
@@ -46,41 +41,36 @@ def block_expansion(
    model_name_or_path: str,
    output_dir: str,
    num_expand: int,
-    shard_size: str = "2GB",
+    shard_size: str = "5GB",
    save_safetensors: bool = True,
 ):
    r"""
-    Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
+    Performs block expansion for LLaMA, Mistral, Qwen2 or Yi models.
    Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
    """
-    config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path)
+    config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
    num_layers = getattr(config, "num_hidden_layers")
+    if num_layers % num_expand != 0:
+        raise ValueError(f"`num_layers` {num_layers} should be divisible by `num_expand` {num_expand}.")
    setattr(config, "num_hidden_layers", num_layers + num_expand)
    config.save_pretrained(output_dir)
-    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer.save_pretrained(output_dir)
-    config = AutoConfig.from_pretrained(model_name_or_path)  # load the original one
+    print(f"Expanding model of {num_layers} layers to {num_layers + num_expand} layers.")
-    if save_safetensors:
-        setattr(config, "tie_word_embeddings", False)  # safetensors does not allow shared weights
    model = AutoModelForCausalLM.from_pretrained(
-        model_name_or_path,
+        model_name_or_path, torch_dtype="auto", device_map="cpu", trust_remote_code=True, low_cpu_mem_usage=True
-        config=config,
-        torch_dtype="auto",
-        trust_remote_code=True,
-        low_cpu_mem_usage=True,
    )
    assert isinstance(model, PreTrainedModel)  # type hint
-    state_dict = model.state_dict()
+    if save_safetensors and getattr(model.config, "tie_word_embeddings", False):
+        del model.lm_head  # safetensors does not allow shared weights
-    if num_layers % num_expand != 0:
-        raise ValueError(f"`num_layers` {num_layers} should be divisible by `num_expand` {num_expand}.")
    split = num_layers // num_expand
    layer_cnt = 0
-    output_state_dict = OrderedDict()
+    state_dict = model.state_dict()
+    output_state_dict: Dict[str, "torch.Tensor"] = OrderedDict()
    for i in range(num_layers):
        for key, value in state_dict.items():
            if f".{i:d}." in key:
@@ -104,17 +94,24 @@ def block_expansion(
            output_state_dict[key] = value
    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
-    shards, index = shard_checkpoint(output_state_dict, max_shard_size=shard_size, weights_name=weights_name)
+    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+    state_dict_split = split_torch_state_dict_into_shards(
-    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
+        output_state_dict, filename_pattern=filename_pattern, max_shard_size=shard_size
+    )
+    for shard_file, tensors in tqdm(state_dict_split.filename_to_tensors.items(), desc="Save weights"):
+        shard = {tensor: output_state_dict[tensor].contiguous() for tensor in tensors}
        if save_safetensors:
            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
        else:
            torch.save(shard, os.path.join(output_dir, shard_file))
-    if index is None:
+    if not state_dict_split.is_sharded:
        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
    else:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, sort_keys=True)

--- a/scripts/llamafy_baichuan2.py
+++ b/scripts/llamafy_baichuan2.py
-# Copyright 2024 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-from collections import OrderedDict
-from typing import Any, Dict
-import fire
-import torch
-from safetensors.torch import save_file
-from tqdm import tqdm
-from transformers.modeling_utils import (
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
-    shard_checkpoint,
-)
-CONFIG_NAME = "config.json"
-def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool):
-    baichuan2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
-    for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
-        if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".bin"):
-            shard_weight = torch.load(os.path.join(input_dir, filepath), map_location="cpu")
-            baichuan2_state_dict.update(shard_weight)
-    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
-    for key, value in tqdm(baichuan2_state_dict.items(), desc="Convert format"):
-        if "W_pack" in key:
-            proj_size = value.size(0) // 3
-            llama2_state_dict[key.replace("W_pack", "q_proj")] = value[:proj_size, :]
-            llama2_state_dict[key.replace("W_pack", "k_proj")] = value[proj_size : 2 * proj_size, :]
-            llama2_state_dict[key.replace("W_pack", "v_proj")] = value[2 * proj_size :, :]
-        elif "lm_head" in key:
-            llama2_state_dict[key] = torch.nn.functional.normalize(value)
-        else:
-            llama2_state_dict[key] = value
-    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
-    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
-    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
-        if save_safetensors:
-            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
-        else:
-            torch.save(shard, os.path.join(output_dir, shard_file))
-    if index is None:
-        print(f"Model weights saved in {os.path.join(output_dir, WEIGHTS_NAME)}")
-    else:
-        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
-        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
-            json.dump(index, f, indent=2, sort_keys=True)
-        print(f"Model weights saved in {output_dir}")
-def save_config(input_dir: str, output_dir: str):
-    with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as f:
-        llama2_config_dict: Dict[str, Any] = json.load(f)
-    llama2_config_dict["architectures"] = ["LlamaForCausalLM"]
-    llama2_config_dict.pop("auto_map", None)
-    llama2_config_dict.pop("tokenizer_class", None)
-    llama2_config_dict["model_type"] = "llama"
-    with open(os.path.join(output_dir, CONFIG_NAME), "w", encoding="utf-8") as f:
-        json.dump(llama2_config_dict, f, indent=2)
-    print(f"Model config saved in {os.path.join(output_dir, CONFIG_NAME)}")
-def llamafy_baichuan2(
-    input_dir: str,
-    output_dir: str,
-    shard_size: str = "2GB",
-    save_safetensors: bool = True,
-):
-    r"""
-    Converts the Baichuan2-7B model in the same format as LLaMA2-7B.
-    Usage: python llamafy_baichuan2.py --input_dir input --output_dir output
-    Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied
-    """
-    try:
-        os.makedirs(output_dir, exist_ok=False)
-    except Exception as e:
-        raise print("Output dir already exists", e)
-    save_weight(input_dir, output_dir, shard_size, save_safetensors)
-    save_config(input_dir, output_dir)
-if __name__ == "__main__":
-    fire.Fire(llamafy_baichuan2)
--- a/scripts/llamafy_qwen.py
+++ b/scripts/llamafy_qwen.py
-# Copyright 2024 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-from collections import OrderedDict
-from typing import Any, Dict
-import fire
-import torch
-from safetensors import safe_open
-from safetensors.torch import save_file
-from tqdm import tqdm
-from transformers.modeling_utils import (
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
-    shard_checkpoint,
-)
-from transformers.utils import check_min_version
-try:
-    check_min_version("4.34.0")
-except Exception:
-    raise ValueError("Please upgrade `transformers` to 4.34.0")
-CONFIG_NAME = "config.json"
-def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool) -> str:
-    qwen_state_dict: Dict[str, torch.Tensor] = OrderedDict()
-    for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
-        if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".safetensors"):
-            with safe_open(os.path.join(input_dir, filepath), framework="pt", device="cpu") as f:
-                for key in f.keys():
-                    qwen_state_dict[key] = f.get_tensor(key)
-    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
-    torch_dtype = None
-    for key, value in tqdm(qwen_state_dict.items(), desc="Convert format"):
-        if torch_dtype is None:
-            torch_dtype = value.dtype
-        if "wte" in key:
-            llama2_state_dict["model.embed_tokens.weight"] = value
-        elif "ln_f" in key:
-            llama2_state_dict["model.norm.weight"] = value
-        else:
-            key = key.replace("transformer.h", "model.layers")
-            if "attn.c_attn" in key:
-                proj_size = value.size(0) // 3
-                llama2_state_dict[key.replace("attn.c_attn", "self_attn.q_proj")] = value[:proj_size, ...]
-                llama2_state_dict[key.replace("attn.c_attn", "self_attn.k_proj")] = value[
-                    proj_size : 2 * proj_size, ...
-                ]
-                llama2_state_dict[key.replace("attn.c_attn", "self_attn.v_proj")] = value[2 * proj_size :, ...]
-            elif "attn.c_proj" in key:
-                llama2_state_dict[key.replace("attn.c_proj", "self_attn.o_proj")] = value
-                llama2_state_dict[key.replace("attn.c_proj.weight", "self_attn.o_proj.bias")] = torch.zeros_like(
-                    value[:, 0]
-                ).squeeze()
-            elif "ln_1" in key:
-                llama2_state_dict[key.replace("ln_1", "input_layernorm")] = value
-            elif "ln_2" in key:
-                llama2_state_dict[key.replace("ln_2", "post_attention_layernorm")] = value
-            elif "mlp.w1" in key:
-                llama2_state_dict[key.replace("mlp.w1", "mlp.up_proj")] = value
-            elif "mlp.w2" in key:
-                llama2_state_dict[key.replace("mlp.w2", "mlp.gate_proj")] = value
-            elif "mlp.c_proj" in key:
-                llama2_state_dict[key.replace("mlp.c_proj", "mlp.down_proj")] = value
-            elif "lm_head" in key:
-                llama2_state_dict[key] = value
-            else:
-                raise KeyError(f"Unable to process key {key}")
-    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
-    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
-    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
-        if save_safetensors:
-            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
-        else:
-            torch.save(shard, os.path.join(output_dir, shard_file))
-    if index is None:
-        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}")
-    else:
-        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
-        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
-            json.dump(index, f, indent=2, sort_keys=True)
-        print(f"Model weights saved in {output_dir}")
-    return str(torch_dtype).replace("torch.", "")
-def save_config(input_dir: str, output_dir: str, torch_dtype: str):
-    with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as f:
-        qwen_config_dict: Dict[str, Any] = json.load(f)
-    llama2_config_dict: Dict[str, Any] = OrderedDict()
-    llama2_config_dict["architectures"] = ["LlamaForCausalLM"]
-    llama2_config_dict["hidden_act"] = "silu"
-    llama2_config_dict["hidden_size"] = qwen_config_dict["hidden_size"]
-    llama2_config_dict["initializer_range"] = qwen_config_dict["initializer_range"]
-    llama2_config_dict["intermediate_size"] = qwen_config_dict["intermediate_size"] // 2
-    llama2_config_dict["max_position_embeddings"] = qwen_config_dict["max_position_embeddings"]
-    llama2_config_dict["model_type"] = "llama"
-    llama2_config_dict["num_attention_heads"] = qwen_config_dict["num_attention_heads"]
-    llama2_config_dict["num_hidden_layers"] = qwen_config_dict["num_hidden_layers"]
-    llama2_config_dict["num_key_value_heads"] = qwen_config_dict["hidden_size"] // qwen_config_dict["kv_channels"]
-    llama2_config_dict["pretraining_tp"] = 1
-    llama2_config_dict["rms_norm_eps"] = qwen_config_dict["layer_norm_epsilon"]
-    llama2_config_dict["rope_scaling"] = None
-    llama2_config_dict["tie_word_embeddings"] = qwen_config_dict["tie_word_embeddings"]
-    llama2_config_dict["torch_dtype"] = torch_dtype
-    llama2_config_dict["transformers_version"] = "4.34.0"
-    llama2_config_dict["use_cache"] = True
-    llama2_config_dict["vocab_size"] = qwen_config_dict["vocab_size"]
-    llama2_config_dict["attention_bias"] = True
-    with open(os.path.join(output_dir, CONFIG_NAME), "w", encoding="utf-8") as f:
-        json.dump(llama2_config_dict, f, indent=2)
-    print(f"Model config saved in {os.path.join(output_dir, CONFIG_NAME)}")
-def llamafy_qwen(
-    input_dir: str,
-    output_dir: str,
-    shard_size: str = "2GB",
-    save_safetensors: bool = False,
-):
-    r"""
-    Converts the Qwen models in the same format as LLaMA2.
-    Usage: python llamafy_qwen.py --input_dir input --output_dir output
-    Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied
-    """
-    try:
-        os.makedirs(output_dir, exist_ok=False)
-    except Exception as e:
-        raise print("Output dir already exists", e)
-    torch_dtype = save_weight(input_dir, output_dir, shard_size, save_safetensors)
-    save_config(input_dir, output_dir, torch_dtype)
-if __name__ == "__main__":
-    fire.Fire(llamafy_qwen)
--- a/scripts/loftq_init.py
+++ b/scripts/loftq_init.py
-# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
 #
 # This code is based on the HuggingFace's PEFT library.
 # https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py

--- a/scripts/pissa_init.py
+++ b/scripts/pissa_init.py
-# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
 #
 # This code is based on the HuggingFace's PEFT library.
 # https://github.com/huggingface/peft/blob/v0.11.0/examples/pissa_finetuning/preprocess.py

--- a/scripts/stat_utils/cal_flops.py
+++ b/scripts/stat_utils/cal_flops.py
-# Copyright 2024 Microsoft Corporation and the LlamaFactory team.
+# Copyright 2025 Microsoft Corporation and the LlamaFactory team.
 #
 # This code is inspired by the Microsoft's DeepSpeed library.
 # https://www.deepspeed.ai/tutorials/flops-profiler/

--- a/scripts/stat_utils/cal_lr.py
+++ b/scripts/stat_utils/cal_lr.py
-# Copyright 2024 imoneoi and the LlamaFactory team.
+# Copyright 2025 imoneoi and the LlamaFactory team.
 #
 # This code is inspired by the imoneoi's OpenChat library.
 # https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py

--- a/scripts/stat_utils/cal_mfu.py
+++ b/scripts/stat_utils/cal_mfu.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -142,21 +142,23 @@ def calculate_mfu(
        args["deepspeed"] = f"examples/deepspeed/ds_z{deepspeed_stage}_config.json"
    run_exp(args)
-    with open(os.path.join("saves", "test_mfu", "all_results.json"), encoding="utf-8") as f:
-        result = json.load(f)
    if dist.is_initialized():
+        dist.barrier()
        world_size = dist.get_world_size()
    else:
        world_size = 1
-    total_batch_size = batch_size * world_size
+    if int(os.getenv("LOCAL_RANK", "0")) == 0:
-    mfu_value = (
+        with open(os.path.join("saves", "test_mfu", "all_results.json"), encoding="utf-8") as f:
-        result["train_steps_per_second"]
+            result = json.load(f)
-        * compute_model_flops(model_name_or_path, total_batch_size, seq_length)
-        / compute_device_flops(world_size)
+        total_batch_size = batch_size * world_size
-    )
+        mfu_value = (
-    print(f"MFU: {mfu_value * 100:.2f}%")
+            result["train_steps_per_second"]
+            * compute_model_flops(model_name_or_path, total_batch_size, seq_length)
+            / compute_device_flops(world_size)
+        )
+        print(f"MFU: {mfu_value * 100:.2f}%")
 if __name__ == "__main__":

--- a/scripts/stat_utils/cal_ppl.py
+++ b/scripts/stat_utils/cal_ppl.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -49,6 +49,7 @@ class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
                    "labels": feature["chosen_input_ids"] if self.train_on_prompt else feature["chosen_labels"],
                    "images": feature["images"],
                    "videos": feature["videos"],
+                    "audios": feature["audios"],
                }
            )

--- a/scripts/stat_utils/length_cdf.py
+++ b/scripts/stat_utils/length_cdf.py
-# Copyright 2024 the LlamaFactory team.
+# Copyright 2025 the LlamaFactory team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/scripts/test_image.py
+++ b/scripts/test_image.py
-# Copyright 2024 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from openai import OpenAI
-from transformers.utils.versions import require_version
-require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
-def main():
-    client = OpenAI(
-        api_key="{}".format(os.environ.get("API_KEY", "0")),
-        base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
-    )
-    messages = []
-    messages.append(
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "Output the color and number of each box."},
-                {
-                    "type": "image_url",
-                    "image_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/boxes.png"},
-                },
-            ],
-        }
-    )
-    result = client.chat.completions.create(messages=messages, model="test")
-    messages.append(result.choices[0].message)
-    print("Round 1:", result.choices[0].message.content)
-    # The image shows a pyramid of colored blocks with numbers on them. Here are the colors and numbers of ...
-    messages.append(
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What kind of flower is this?"},
-                {
-                    "type": "image_url",
-                    "image_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/flowers.jpg"},
-                },
-            ],
-        }
-    )
-    result = client.chat.completions.create(messages=messages, model="test")
-    messages.append(result.choices[0].message)
-    print("Round 2:", result.choices[0].message.content)
-    # The image shows a cluster of forget-me-not flowers. Forget-me-nots are small ...
-if __name__ == "__main__":
-    main()