models add

5ed76316 · 雍大凯 · b2379236 · 5ed76316 · 5ed76316 · 5ed76316
Commit 5ed76316 authored Apr 08, 2026 by 雍大凯
20 changed files
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/api_example/test_toolcall.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/api_example/test_toolcall.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+from openai import OpenAI
+from transformers.utils.versions import require_version
+
+
+# Check at import time that the installed `openai` package satisfies ">=1.5.0";
+# the second argument is the hint passed along for the failure case.
+require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
+
+
+def calculate_gpa(grades: list[str], hours: list[int]) -> float:
+    r"""Return the credit-weighted grade point average, rounded to two decimals.
+
+    Letter grades are scored as A=4, B=3 and C=2, and each grade is weighted by the
+    credit hours found at the same position. A grade outside that table raises
+    ``KeyError``; a zero total of credit hours raises ``ZeroDivisionError``.
+    """
+    points = {"A": 4, "B": 3, "C": 2}
+    pairs = list(zip(grades, hours))
+    weighted_sum = sum(points[letter] * credit for letter, credit in pairs)
+    credit_sum = sum(credit for _, credit in pairs)
+    return round(weighted_sum / credit_sum, 2)
+
+
+def main():
+    r"""Run one tool-calling round trip against a local OpenAI-compatible endpoint.
+
+    The API key is read from ``API_KEY`` (default ``"0"``) and the port from ``API_PORT``
+    (default ``8000``). The model is asked a GPA question with the ``calculate_gpa`` tool
+    attached, the requested call is executed locally, and its JSON result is sent back so
+    the model can produce the final answer.
+
+    Raises:
+        ValueError: if the first response carries no tool calls.
+    """
+    api_key = "{}".format(os.getenv("API_KEY", "0"))
+    base_url = "http://localhost:{}/v1".format(os.getenv("API_PORT", 8000))
+    client = OpenAI(api_key=api_key, base_url=base_url)
+
+    # JSON schema describing the arguments of `calculate_gpa`.
+    gpa_parameters = {
+        "type": "object",
+        "properties": {
+            "grades": {"type": "array", "items": {"type": "string"}, "description": "The grades"},
+            "hours": {"type": "array", "items": {"type": "integer"}, "description": "The credit hours"},
+        },
+        "required": ["grades", "hours"],
+    }
+    gpa_tool = {
+        "type": "function",
+        "function": {
+            "name": "calculate_gpa",
+            "description": "Calculate the Grade Point Average (GPA) based on grades and credit hours",
+            "parameters": gpa_parameters,
+        },
+    }
+    tools = [gpa_tool]
+    tool_map = {"calculate_gpa": calculate_gpa}
+
+    messages = [{"role": "user", "content": "My grades are A, A, B, and C. The credit hours are 3, 4, 3, and 2."}]
+    result = client.chat.completions.create(messages=messages, model="test", tools=tools)
+    assistant_message = result.choices[0].message
+    if assistant_message.tool_calls is None:
+        raise ValueError("Cannot retrieve function call from the response.")
+
+    # Keep the assistant turn in the history before answering its tool request.
+    messages.append(assistant_message)
+    tool_call = assistant_message.tool_calls[0].function
+    print(tool_call)
+    # Function(arguments='{"grades": ["A", "A", "B", "C"], "hours": [3, 4, 3, 2]}', name='calculate_gpa')
+    name = tool_call.name
+    arguments = json.loads(tool_call.arguments)
+    tool_result = tool_map[name](**arguments)
+    tool_message = {"role": "tool", "content": json.dumps({"gpa": tool_result}, ensure_ascii=False)}
+    messages.append(tool_message)
+
+    result = client.chat.completions.create(messages=messages, model="test", tools=tools)
+    print(result.choices[0].message.content)
+    # Based on the grades and credit hours you provided, your Grade Point Average (GPA) is 3.42.
+
+
+# Run the tool-calling example only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_baichuan2.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_baichuan2.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections import OrderedDict
+from typing import Any
+
+import fire
+import torch
+from huggingface_hub import split_torch_state_dict_into_shards
+from safetensors.torch import save_file
+from tqdm import tqdm
+from transformers.modeling_utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
+
+
+# File name of the model config; `save_config` reads it from the input directory
+# and writes the converted version under the same name in the output directory.
+CONFIG_NAME = "config.json"
+
+
+def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool):
+    r"""Convert the Baichuan2 ``.bin`` checkpoints in ``input_dir`` into LLaMA-style weights.
+
+    Every ``*.bin`` file directly inside ``input_dir`` is loaded on CPU and merged into one
+    state dict. Tensors whose name contains ``W_pack`` are split along dim 0 into ``q_proj``,
+    ``k_proj`` and ``v_proj``; tensors whose name contains ``lm_head`` are passed through
+    ``torch.nn.functional.normalize``; everything else is kept as is. The result is split into
+    shards limited by ``shard_size`` and written to ``output_dir`` as safetensors files or
+    ``torch.save`` files depending on ``save_safetensors``. An index file is written only when
+    more than one shard is produced.
+    """
+    source_state: dict[str, torch.Tensor] = OrderedDict()
+    for entry in tqdm(os.listdir(input_dir), desc="Load weights"):
+        entry_path = os.path.join(input_dir, entry)
+        if not os.path.isfile(entry_path) or not entry.endswith(".bin"):
+            continue
+
+        source_state.update(torch.load(entry_path, map_location="cpu", weights_only=True))
+
+    llama_state_dict: dict[str, torch.Tensor] = OrderedDict()
+    for name, tensor in tqdm(source_state.items(), desc="Convert format"):
+        if "W_pack" in name:
+            # Rows [0, n) -> q, [n, 2n) -> k, [2n, end) -> v, with n = rows // 3.
+            third = tensor.size(0) // 3
+            pieces = (
+                ("q_proj", tensor[:third, :]),
+                ("k_proj", tensor[third : 2 * third, :]),
+                ("v_proj", tensor[2 * third :, :]),
+            )
+            for proj_name, piece in pieces:
+                llama_state_dict[name.replace("W_pack", proj_name)] = piece
+        elif "lm_head" in name:
+            llama_state_dict[name] = torch.nn.functional.normalize(tensor)
+        else:
+            llama_state_dict[name] = tensor
+
+    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
+    # Insert a "{suffix}" placeholder in front of the extension for the shard file names.
+    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+    state_dict_split = split_torch_state_dict_into_shards(
+        llama_state_dict, filename_pattern=filename_pattern, max_shard_size=shard_size
+    )
+    for shard_file, tensor_names in tqdm(state_dict_split.filename_to_tensors.items(), desc="Save weights"):
+        shard_path = os.path.join(output_dir, shard_file)
+        shard = {tensor_name: llama_state_dict[tensor_name].contiguous() for tensor_name in tensor_names}
+        if save_safetensors:
+            save_file(shard, shard_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, shard_path)
+
+    if state_dict_split.is_sharded:
+        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
+            json.dump(index, f, indent=2, sort_keys=True)
+
+        print(f"Model weights saved in {output_dir}.")
+    else:
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
+
+
+def save_config(input_dir: str, output_dir: str):
+    r"""Rewrite the Baichuan2 ``config.json`` as a LLaMA config in ``output_dir``.
+
+    The architecture is set to ``LlamaForCausalLM``, the ``auto_map`` and ``tokenizer_class``
+    entries are dropped when present, and ``model_type`` becomes ``"llama"``. All other
+    entries are carried over untouched.
+    """
+    source_path = os.path.join(input_dir, CONFIG_NAME)
+    target_path = os.path.join(output_dir, CONFIG_NAME)
+
+    with open(source_path, encoding="utf-8") as f:
+        llama2_config_dict: dict[str, Any] = json.load(f)
+
+    llama2_config_dict["architectures"] = ["LlamaForCausalLM"]
+    for stale_key in ("auto_map", "tokenizer_class"):
+        llama2_config_dict.pop(stale_key, None)
+
+    llama2_config_dict["model_type"] = "llama"
+
+    with open(target_path, "w", encoding="utf-8") as f:
+        json.dump(llama2_config_dict, f, indent=2)
+
+    print(f"Model config saved in {target_path}")
+
+
+def llamafy_baichuan2(
+    input_dir: str,
+    output_dir: str,
+    shard_size: str = "2GB",
+    save_safetensors: bool = True,
+):
+    r"""Convert the Baichuan2-7B model in the same format as LLaMA2-7B.
+
+    Usage: python llamafy_baichuan2.py --input_dir input --output_dir output
+    Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied
+
+    Args:
+        input_dir: directory holding the original ``.bin`` weights and ``config.json``.
+        output_dir: directory to create for the converted model; it must not exist yet.
+        shard_size: maximum size of each saved weight shard.
+        save_safetensors: save in safetensors format when true, otherwise with ``torch.save``.
+
+    Raises:
+        FileExistsError: if ``output_dir`` already exists.
+    """
+    try:
+        os.makedirs(output_dir, exist_ok=False)
+    except FileExistsError as e:
+        # Refuse to write into an existing directory. Other OS errors (e.g. permissions)
+        # propagate unchanged instead of being mislabelled as "already exists".
+        raise FileExistsError(f"Output dir already exists: {output_dir}") from e
+
+    save_weight(input_dir, output_dir, shard_size, save_safetensors)
+    save_config(input_dir, output_dir)
+
+
+# Expose `llamafy_baichuan2` as a command-line interface through python-fire.
+if __name__ == "__main__":
+    fire.Fire(llamafy_baichuan2)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_qwen.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_qwen.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections import OrderedDict
+from typing import Any
+
+import fire
+import torch
+from huggingface_hub import split_torch_state_dict_into_shards
+from safetensors import safe_open
+from safetensors.torch import save_file
+from tqdm import tqdm
+from transformers.modeling_utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
+from transformers.utils import check_min_version
+
+
+# Import-time guard: any failure of `check_min_version("4.34.0")` is reported
+# as a ValueError asking the user to upgrade `transformers`.
+try:
+    check_min_version("4.34.0")
+except Exception:
+    raise ValueError("Please upgrade `transformers` to 4.34.0")
+
+
+# File name of the model config; `save_config` reads it from the input directory
+# and writes the generated LLaMA config under the same name in the output directory.
+CONFIG_NAME = "config.json"
+
+
+def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool) -> str:
+    r"""Convert the Qwen ``.safetensors`` checkpoints in ``input_dir`` into LLaMA-style weights.
+
+    Every ``*.safetensors`` file directly inside ``input_dir`` is read on CPU. Tensor names are
+    mapped to the LLaMA layout: the fused ``attn.c_attn`` tensors are split along dim 0 into
+    q/k/v projections, ``attn.c_proj`` becomes ``self_attn.o_proj`` with an added all-zero bias,
+    and the norm/MLP tensors are renamed. The result is split into shards limited by
+    ``shard_size`` and written to ``output_dir``; an index file is written only when sharded.
+
+    Returns:
+        The dtype of the first loaded tensor as a string without the ``torch.`` prefix.
+
+    Raises:
+        KeyError: if a tensor name matches none of the known patterns.
+    """
+    qwen_state_dict: dict[str, torch.Tensor] = OrderedDict()
+    for entry in tqdm(os.listdir(input_dir), desc="Load weights"):
+        entry_path = os.path.join(input_dir, entry)
+        if not os.path.isfile(entry_path) or not entry.endswith(".safetensors"):
+            continue
+
+        with safe_open(entry_path, framework="pt", device="cpu") as reader:
+            for tensor_name in reader.keys():
+                qwen_state_dict[tensor_name] = reader.get_tensor(tensor_name)
+
+    # Pure renames, tried in this order once the attention patterns have been ruled out.
+    simple_renames = (
+        ("ln_1", "input_layernorm"),
+        ("ln_2", "post_attention_layernorm"),
+        ("mlp.w1", "mlp.up_proj"),
+        ("mlp.w2", "mlp.gate_proj"),
+        ("mlp.c_proj", "mlp.down_proj"),
+    )
+
+    llama_state_dict: dict[str, torch.Tensor] = OrderedDict()
+    torch_dtype = None
+    for name, tensor in tqdm(qwen_state_dict.items(), desc="Convert format"):
+        if torch_dtype is None:
+            torch_dtype = tensor.dtype
+
+        if "wte" in name:
+            llama_state_dict["model.embed_tokens.weight"] = tensor
+            continue
+
+        if "ln_f" in name:
+            llama_state_dict["model.norm.weight"] = tensor
+            continue
+
+        name = name.replace("transformer.h", "model.layers")
+        if "attn.c_attn" in name:
+            # Rows [0, n) -> q, [n, 2n) -> k, [2n, end) -> v, with n = rows // 3.
+            third = tensor.size(0) // 3
+            pieces = (
+                ("self_attn.q_proj", tensor[:third, ...]),
+                ("self_attn.k_proj", tensor[third : 2 * third, ...]),
+                ("self_attn.v_proj", tensor[2 * third :, ...]),
+            )
+            for proj_name, piece in pieces:
+                llama_state_dict[name.replace("attn.c_attn", proj_name)] = piece
+
+            continue
+
+        if "attn.c_proj" in name:
+            llama_state_dict[name.replace("attn.c_proj", "self_attn.o_proj")] = tensor
+            # Add an all-zero bias shaped like one column of the output projection weight.
+            zero_bias = torch.zeros_like(tensor[:, 0]).squeeze()
+            llama_state_dict[name.replace("attn.c_proj.weight", "self_attn.o_proj.bias")] = zero_bias
+            continue
+
+        for old_part, new_part in simple_renames:
+            if old_part in name:
+                llama_state_dict[name.replace(old_part, new_part)] = tensor
+                break
+        else:
+            if "lm_head" in name:
+                llama_state_dict[name] = tensor
+            else:
+                raise KeyError(f"Unable to process key {name}")
+
+    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
+    # Insert a "{suffix}" placeholder in front of the extension for the shard file names.
+    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+    state_dict_split = split_torch_state_dict_into_shards(
+        llama_state_dict, filename_pattern=filename_pattern, max_shard_size=shard_size
+    )
+    for shard_file, tensor_names in tqdm(state_dict_split.filename_to_tensors.items(), desc="Save weights"):
+        shard_path = os.path.join(output_dir, shard_file)
+        shard = {tensor_name: llama_state_dict[tensor_name].contiguous() for tensor_name in tensor_names}
+        if save_safetensors:
+            save_file(shard, shard_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, shard_path)
+
+    if state_dict_split.is_sharded:
+        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
+            json.dump(index, f, indent=2, sort_keys=True)
+
+        print(f"Model weights saved in {output_dir}.")
+    else:
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
+
+    return str(torch_dtype).replace("torch.", "")
+
+
+def save_config(input_dir: str, output_dir: str, torch_dtype: str):
+    r"""Build a LLaMA ``config.json`` in ``output_dir`` from the Qwen config in ``input_dir``.
+
+    Size-related fields are copied from the Qwen config (``intermediate_size`` is halved and
+    ``num_key_value_heads`` is ``hidden_size // kv_channels``); the remaining fields are fixed
+    values. ``torch_dtype`` is stored verbatim.
+
+    Raises:
+        KeyError: if the Qwen config lacks one of the fields that are read.
+    """
+    with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as f:
+        qwen_config_dict: dict[str, Any] = json.load(f)
+
+    # The pairs are listed in the order they should appear in the written file.
+    llama2_config_dict: dict[str, Any] = OrderedDict(
+        [
+            ("architectures", ["LlamaForCausalLM"]),
+            ("hidden_act", "silu"),
+            ("hidden_size", qwen_config_dict["hidden_size"]),
+            ("initializer_range", qwen_config_dict["initializer_range"]),
+            ("intermediate_size", qwen_config_dict["intermediate_size"] // 2),
+            ("max_position_embeddings", qwen_config_dict["max_position_embeddings"]),
+            ("model_type", "llama"),
+            ("num_attention_heads", qwen_config_dict["num_attention_heads"]),
+            ("num_hidden_layers", qwen_config_dict["num_hidden_layers"]),
+            ("num_key_value_heads", qwen_config_dict["hidden_size"] // qwen_config_dict["kv_channels"]),
+            ("pretraining_tp", 1),
+            ("rms_norm_eps", qwen_config_dict["layer_norm_epsilon"]),
+            ("rope_scaling", None),
+            ("tie_word_embeddings", qwen_config_dict["tie_word_embeddings"]),
+            ("torch_dtype", torch_dtype),
+            ("transformers_version", "4.34.0"),
+            ("use_cache", True),
+            ("vocab_size", qwen_config_dict["vocab_size"]),
+            ("attention_bias", True),
+        ]
+    )
+
+    target_path = os.path.join(output_dir, CONFIG_NAME)
+    with open(target_path, "w", encoding="utf-8") as f:
+        json.dump(llama2_config_dict, f, indent=2)
+
+    print(f"Model config saved in {target_path}")
+
+
+def llamafy_qwen(
+    input_dir: str,
+    output_dir: str,
+    shard_size: str = "2GB",
+    save_safetensors: bool = False,
+):
+    r"""Convert the Qwen models in the same format as LLaMA2.
+
+    Usage: python llamafy_qwen.py --input_dir input --output_dir output
+    Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied
+
+    Args:
+        input_dir: directory holding the original ``.safetensors`` weights and ``config.json``.
+        output_dir: directory to create for the converted model; it must not exist yet.
+        shard_size: maximum size of each saved weight shard.
+        save_safetensors: save in safetensors format when true, otherwise with ``torch.save``.
+
+    Raises:
+        FileExistsError: if ``output_dir`` already exists.
+    """
+    try:
+        os.makedirs(output_dir, exist_ok=False)
+    except FileExistsError as e:
+        # Refuse to write into an existing directory. Other OS errors (e.g. permissions)
+        # propagate unchanged instead of being mislabelled as "already exists".
+        raise FileExistsError(f"Output dir already exists: {output_dir}") from e
+
+    # The weights' dtype is recorded in the generated config.
+    torch_dtype = save_weight(input_dir, output_dir, shard_size, save_safetensors)
+    save_config(input_dir, output_dir, torch_dtype)
+
+
+# Expose `llamafy_qwen` as a command-line interface through python-fire.
+if __name__ == "__main__":
+    fire.Fire(llamafy_qwen)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/tiny_llama4.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/tiny_llama4.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import Llama4Config, Llama4ForConditionalGeneration, Llama4TextConfig, Llama4VisionConfig
+
+
+if __name__ == "__main__":
+    # Instantiate a small Llama4 vision + text model from hand-written configs
+    # and save it to the "tiny-llama4" directory.
+    text_hidden_size = 512
+    text_attention_heads = 8
+
+    vision_config = Llama4VisionConfig(
+        hidden_size=1408,
+        image_size=336,
+        intermediate_size=5632,
+        num_attention_heads=16,
+        num_hidden_layers=4,
+        vision_output_dim=4096,
+    )
+    text_config = Llama4TextConfig(
+        hidden_size=text_hidden_size,
+        intermediate_size=1024,
+        intermediate_size_mlp=1024,
+        num_hidden_layers=4,
+        num_attention_heads=text_attention_heads,
+        num_key_value_heads=2,
+        # One head's width: the hidden size divided evenly across the attention heads.
+        head_dim=text_hidden_size // text_attention_heads,
+        num_local_experts=2,
+    )
+    config = Llama4Config(vision_config=vision_config, text_config=text_config)
+    tiny_model = Llama4ForConditionalGeneration._from_config(config)
+    tiny_model.save_pretrained("tiny-llama4")
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/eval_bleu_rouge.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/eval_bleu_rouge.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import time
+
+import fire
+from datasets import load_dataset
+
+
+# Metric dependencies (jieba, nltk BLEU helpers, rouge_chinese) are imported
+# together so that a missing package produces a single installation hint.
+try:
+    import jieba  # type: ignore
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu  # type: ignore
+    from rouge_chinese import Rouge  # type: ignore
+
+    # Limit jieba's logging to CRITICAL and initialize it once at import time.
+    jieba.setLogLevel(logging.CRITICAL)
+    jieba.initialize()
+except ImportError:
+    # Print the hint, then re-raise the original ImportError.
+    print("Please install llamafactory with `pip install -e .[metrics]`.")
+    raise
+
+
+def compute_metrics(sample):
+    r"""Score one prediction against its label with ROUGE-1/2/L and BLEU-4.
+
+    ``sample`` must provide the strings ``sample["predict"]`` and ``sample["label"]``.
+    ROUGE is computed on jieba-segmented text, BLEU on the raw character sequences.
+    Returns a dict with the keys of the ROUGE result plus ``"bleu-4"``; every value is
+    scaled by 100 and rounded to four decimals.
+    """
+    predict_text = sample["predict"]
+    label_text = sample["label"]
+
+    hypothesis = " ".join(jieba.cut(predict_text))
+    reference = " ".join(jieba.cut(label_text))
+
+    bleu_score = sentence_bleu(
+        [list(label_text)],
+        list(predict_text),
+        smoothing_function=SmoothingFunction().method3,
+    )
+
+    if hypothesis.split() and reference.split():
+        result = Rouge().get_scores(hypothesis, reference)[0]
+    else:
+        # Either side has no tokens after segmentation: report zero for all ROUGE variants.
+        result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+
+    metric_result = {name: round(score["f"] * 100, 4) for name, score in result.items()}
+    metric_result["bleu-4"] = round(bleu_score * 100, 4)
+    return metric_result
+
+
+def main(filename: str):
+    r"""Average the per-sample metrics of a JSON predictions file.
+
+    ``filename`` is loaded with the ``json`` dataset builder and every record is scored by
+    ``compute_metrics`` using 8 worker processes. The mean of each metric is printed and the
+    averages are written to ``predictions_score.json`` in the current directory.
+    """
+    start_time = time.time()
+    dataset = load_dataset("json", data_files=filename, split="train")
+    dataset = dataset.map(compute_metrics, num_proc=8, remove_columns=dataset.column_names)
+    score_dict = dataset.to_dict()
+
+    average_score = {}
+    for task in sorted(score_dict):
+        values = score_dict[task]
+        mean_value = sum(values) / len(values)
+        print(f"{task}: {mean_value:.4f}")
+        average_score[task] = mean_value
+
+    with open("predictions_score.json", "w", encoding="utf-8") as f:
+        json.dump(average_score, f, indent=4)
+
+    elapsed = time.time() - start_time
+    print(f"\nDone in {elapsed:.3f}s.\nScore file saved to predictions_score.json")
+
+
+# Expose `main` as a command-line interface through python-fire.
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/llama_pro.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/llama_pro.py
+# Copyright 2025 Tencent Inc. and the LlamaFactory team.
+#
+# This code is inspired by the Tencent's LLaMA-Pro library.
+# https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections import OrderedDict
+from typing import TYPE_CHECKING
+
+import fire
+import torch
+from huggingface_hub import split_torch_state_dict_into_shards
+from safetensors.torch import save_file
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
+from transformers.modeling_utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+
+def change_name(name: str, old_index: int, new_index: int) -> str:
+    r"""Replace every ``.{old_index}.`` segment of ``name`` with ``.{new_index}.``.
+
+    The surrounding dots are part of the match, so index 3 does not touch ``.13.``.
+    """
+    old_segment = f".{old_index:d}."
+    new_segment = f".{new_index:d}."
+    return new_segment.join(name.split(old_segment))
+
+
+def block_expansion(
+    model_name_or_path: str,
+    output_dir: str,
+    num_expand: int,
+    shard_size: str = "5GB",
+    save_safetensors: bool = True,
+):
+    r"""Perform block expansion for LLaMA, Mistral, Qwen2 or Yi models.
+
+    Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
+
+    The original layers are split into ``num_expand`` equal groups and one extra layer is
+    inserted after each group. The extra layer copies the last layer of its group, except
+    that its ``down_proj`` and ``o_proj`` tensors are zero-filled.
+
+    Args:
+        model_name_or_path: model to load with the ``transformers`` auto classes.
+        output_dir: where the expanded config, tokenizer and weights are saved.
+        num_expand: number of layers to add; must be positive and divide the layer count.
+        shard_size: maximum size of each saved weight shard.
+        save_safetensors: save in safetensors format when true, otherwise with ``torch.save``.
+
+    Raises:
+        ValueError: if ``num_expand`` is not positive or does not divide the layer count.
+    """
+    # Validate before any loading: zero would otherwise surface as a bare ZeroDivisionError
+    # and a negative value would pass the divisibility check and shrink the model.
+    if num_expand <= 0:
+        raise ValueError(f"`num_expand` should be a positive integer, got {num_expand}.")
+
+    config: PretrainedConfig = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
+    num_layers = getattr(config, "num_hidden_layers")
+    if num_layers % num_expand != 0:
+        raise ValueError(f"`num_layers` {num_layers} should be divisible by `num_expand` {num_expand}.")
+
+    setattr(config, "num_hidden_layers", num_layers + num_expand)
+    config.save_pretrained(output_dir)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    tokenizer.save_pretrained(output_dir)
+
+    print(f"Expanding model of {num_layers} layers to {num_layers + num_expand} layers.")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name_or_path, torch_dtype="auto", device_map="cpu", trust_remote_code=True, low_cpu_mem_usage=True
+    )
+    assert isinstance(model, PreTrainedModel)  # type hint
+    if save_safetensors and getattr(model.config, "tie_word_embeddings", False):
+        del model.lm_head  # safetensors does not allow shared weights
+
+    # Number of original layers between two inserted layers.
+    split = num_layers // num_expand
+    layer_cnt = 0  # index of the next layer in the expanded model
+    state_dict = model.state_dict()
+    output_state_dict: dict[str, torch.Tensor] = OrderedDict()
+    for i in range(num_layers):
+        # Copy original layer i to its new position.
+        for key, value in state_dict.items():
+            if f".{i:d}." in key:
+                output_state_dict[change_name(key, i, layer_cnt)] = value
+
+        print(f"Add layer {layer_cnt} copied from layer {i}.")
+        layer_cnt += 1
+        if (i + 1) % split == 0:
+            # End of a group: insert a layer cloned from layer i with zeroed output projections.
+            for key, value in state_dict.items():
+                if f".{i:d}." in key:
+                    if "down_proj" in key or "o_proj" in key:
+                        output_state_dict[change_name(key, i, layer_cnt)] = torch.zeros_like(value)
+                    else:
+                        output_state_dict[change_name(key, i, layer_cnt)] = torch.clone(value)
+
+            print(f"Add layer {layer_cnt} expanded from layer {i}.")
+            layer_cnt += 1
+
+    # Carry over every tensor whose name has not been written above.
+    for key, value in state_dict.items():
+        if key not in output_state_dict:
+            output_state_dict[key] = value
+
+    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
+    # Insert a "{suffix}" placeholder in front of the extension for the shard file names.
+    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+    state_dict_split = split_torch_state_dict_into_shards(
+        output_state_dict, filename_pattern=filename_pattern, max_shard_size=shard_size
+    )
+    for shard_file, tensors in tqdm(state_dict_split.filename_to_tensors.items(), desc="Save weights"):
+        shard = {tensor: output_state_dict[tensor].contiguous() for tensor in tensors}
+        if save_safetensors:
+            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
+        else:
+            torch.save(shard, os.path.join(output_dir, shard_file))
+
+    if not state_dict_split.is_sharded:
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
+    else:
+        # Sharded output also needs an index mapping each tensor to its shard file.
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
+        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
+            json.dump(index, f, indent=2, sort_keys=True)
+
+        print(f"Model weights saved in {output_dir}.")
+
+    print("- Fine-tune this model with:")
+    print(f"model_name_or_path: {output_dir}")
+    print("finetuning_type: freeze")
+    print(f"freeze_trainable_layers: {num_expand}")
+    print("use_llama_pro: true")
+
+
+# Expose `block_expansion` as a command-line interface through python-fire.
+if __name__ == "__main__":
+    fire.Fire(block_expansion)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/loftq_init.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/loftq_init.py
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is based on the HuggingFace's PEFT library.
+# https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING
+
+import fire
+from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+
+def quantize_loftq(
+    model_name_or_path: str,
+    output_dir: str,
+    loftq_bits: int = 4,
+    loftq_iter: int = 4,
+    lora_alpha: int = None,
+    lora_rank: int = 16,
+    lora_dropout: float = 0,
+    lora_target: tuple = ("q_proj", "v_proj"),
+    save_safetensors: bool = True,
+):
+    r"""Initialize LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ).
+
+    Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir
+
+    The adapter is saved to ``<output_dir>/loftq_init`` and the unloaded base model plus the
+    tokenizer to ``output_dir``. ``lora_target`` may be given as a comma-separated string;
+    ``lora_alpha`` defaults to twice ``lora_rank`` when left as ``None``.
+    """
+    if isinstance(lora_target, str):
+        lora_target = [part.strip() for part in lora_target.split(",")]
+
+    if lora_alpha is None:
+        lora_alpha = lora_rank * 2
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto")
+
+    lora_config = LoraConfig(
+        task_type=TaskType.CAUSAL_LM,
+        inference_mode=True,
+        r=lora_rank,
+        lora_alpha=lora_alpha,
+        lora_dropout=lora_dropout,
+        target_modules=lora_target,
+        init_lora_weights="loftq",
+        loftq_config=LoftQConfig(loftq_bits=loftq_bits, loftq_iter=loftq_iter),
+    )
+
+    # Init LoftQ model
+    print("Initializing LoftQ weights, it may be take several minutes, wait patiently.")
+    peft_model = get_peft_model(model, lora_config)
+    loftq_dir = os.path.join(output_dir, "loftq_init")
+
+    # Save LoftQ model
+    adapter_config = peft_model.peft_config["default"]
+    adapter_config.base_model_name_or_path = os.path.abspath(output_dir)
+    adapter_config.init_lora_weights = True  # don't apply loftq again
+    peft_model.save_pretrained(loftq_dir, safe_serialization=save_safetensors)
+    print(f"Adapter weights saved in {loftq_dir}")
+
+    # Save base model
+    base_model: PreTrainedModel = peft_model.unload()
+    base_model.save_pretrained(output_dir, safe_serialization=save_safetensors)
+    tokenizer.save_pretrained(output_dir)
+    print(f"Model weights saved in {output_dir}")
+
+    for hint in (
+        "- Fine-tune this model with:",
+        f"model_name_or_path: {output_dir}",
+        f"adapter_name_or_path: {loftq_dir}",
+        "finetuning_type: lora",
+        f"quantization_bit: {loftq_bits}",
+    ):
+        print(hint)
+
+
+# Expose `quantize_loftq` as a command-line interface through python-fire.
+if __name__ == "__main__":
+    fire.Fire(quantize_loftq)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/pissa_init.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/pissa_init.py
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is based on the HuggingFace's PEFT library.
+# https://github.com/huggingface/peft/blob/v0.11.0/examples/pissa_finetuning/preprocess.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING
+
+import fire
+from peft import LoraConfig, TaskType, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+
+def quantize_pissa(
+    model_name_or_path: str,
+    output_dir: str,
+    pissa_iter: int = 16,
+    lora_alpha: int = None,
+    lora_rank: int = 16,
+    lora_dropout: float = 0,
+    lora_target: tuple = ("q_proj", "v_proj"),
+    save_safetensors: bool = True,
+):
+    r"""Initialize LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA).
+
+    Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir
+
+    The adapter is saved to ``<output_dir>/pissa_init`` and the unloaded base model plus the
+    tokenizer to ``output_dir``. ``lora_target`` may be given as a comma-separated string;
+    ``lora_alpha`` defaults to twice ``lora_rank`` when left as ``None``. ``pissa_iter == -1``
+    selects the ``"pissa"`` initializer, any other value selects ``"pissa_niter_<pissa_iter>"``.
+    """
+    if isinstance(lora_target, str):
+        lora_target = [part.strip() for part in lora_target.split(",")]
+
+    if lora_alpha is None:
+        lora_alpha = lora_rank * 2
+
+    if pissa_iter == -1:
+        init_method = "pissa"
+    else:
+        init_method = f"pissa_niter_{pissa_iter}"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto")
+
+    lora_config = LoraConfig(
+        task_type=TaskType.CAUSAL_LM,
+        r=lora_rank,
+        lora_alpha=lora_alpha,
+        lora_dropout=lora_dropout,
+        target_modules=lora_target,
+        init_lora_weights=init_method,
+    )
+
+    # Init PiSSA model
+    peft_model = get_peft_model(model, lora_config)
+    pissa_dir = os.path.join(output_dir, "pissa_init")
+
+    # Save PiSSA model
+    adapter_config = peft_model.peft_config["default"]
+    adapter_config.base_model_name_or_path = os.path.abspath(output_dir)
+    adapter_config.init_lora_weights = True  # don't apply pissa again
+    peft_model.save_pretrained(pissa_dir, safe_serialization=save_safetensors)
+    print(f"Adapter weights saved in {pissa_dir}")
+
+    # Save base model
+    base_model: PreTrainedModel = peft_model.unload()
+    base_model.save_pretrained(output_dir, safe_serialization=save_safetensors)
+    tokenizer.save_pretrained(output_dir)
+    print(f"Model weights saved in {output_dir}")
+
+    for hint in (
+        "- Fine-tune this model with:",
+        f"model_name_or_path: {output_dir}",
+        f"adapter_name_or_path: {pissa_dir}",
+        "finetuning_type: lora",
+        "pissa_init: false",
+        "pissa_convert: true",
+        "- and optionally with:",
+        "quantization_bit: 4",
+    ):
+        print(hint)
+
+
+# Expose quantize_pissa as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(quantize_pissa)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/qwen_omni_merge.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/qwen_omni_merge.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Why do we need this script for qwen_omni?
+
+The qwen_omni model consists of two parts:
+1. [Thinker]: [audio_encoder, vision_encoder, LLM backbone], which our repository supports for post-training.
+2. [Talker]: [audio_decoder, wave_model], which cannot be post-trained without a specific tokenizer.
+During post-training only the [Thinker] part is trained, and the [Talker] part is dropped.
+So, to get the complete model, we need to merge the [Talker] part back with the [Thinker] part.
+LoRA mode: [Thinker + LoRA weights] + [Original Talker] -> [Omni model]
+Full mode: [Thinker] + [Original Talker] -> [Omni model]
+For the processor, we save the one from the trained model instead of the one from the original model.
+"""
+
+import os
+import shutil
+
+import fire
+from peft import PeftModel
+from transformers import (
+    AutoProcessor,
+    Qwen2_5OmniForConditionalGeneration,  # type: ignore
+    Qwen2_5OmniThinkerForConditionalGeneration,
+)
+
+
+def merge_lora(
+    base_model_path: str,
+    lora_checkpoint_path: str,
+    extra_file: str = "spk_dict.pt",
+    submodule_name: str = "thinker",
+    save_path: str = "./merged_model_checkpoint",
+):
+    """Merge LoRA weights into one submodule of the original model and save the result.
+
+    The full model is loaded on CPU, the LoRA checkpoint is applied to the chosen submodule and
+    merged into it, and the model is then saved together with the processor taken from the LoRA
+    checkpoint directory. Finally ``extra_file`` is copied over from the base model directory if
+    it exists there.
+
+    Args:
+        base_model_path (str): Path to the original model directory.
+        lora_checkpoint_path (str): Path to the directory containing LoRA weights.
+        extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
+        submodule_name (str): Name of the submodule to merge (default: "thinker").
+        save_path (str): Directory where the merged model and configurations will be saved.
+
+    Raises:
+        AttributeError: If the loaded model has no attribute named ``submodule_name``.
+    """
+    # Load the complete original model.
+    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
+    print("Successfully loaded the original model.")
+
+    # Pick the submodule that receives the LoRA weights (e.g. model.thinker).
+    if not hasattr(model, submodule_name):
+        raise AttributeError(f"The model does not have a submodule named '{submodule_name}'.")
+
+    target_module = getattr(model, submodule_name)
+    print(f"Successfully extracted submodule: {submodule_name}.")
+
+    # Attach the LoRA checkpoint to that submodule and fetch the processor saved alongside it.
+    peft_wrapped = PeftModel.from_pretrained(target_module, lora_checkpoint_path)
+    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
+    print("LoRA weights and processor loaded successfully.")
+
+    # Merge the LoRA weights and put the merged submodule back into the full model.
+    merged_module = peft_wrapped.merge_and_unload()
+    print("LoRA weights merged successfully.")
+    setattr(model, submodule_name, merged_module)
+
+    # Save the merged model and the processor into the same directory.
+    model.save_pretrained(save_path)
+    processor.save_pretrained(save_path)
+    print(f"Merged model and tokenizer saved to {save_path}.")
+
+    # Carry the extra file over from the base model directory when it is present.
+    extra_src = os.path.join(base_model_path, extra_file)
+    extra_dst = os.path.join(save_path, extra_file)
+    if not os.path.exists(extra_src):
+        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    else:
+        shutil.copy(extra_src, extra_dst)
+        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
+
+
+def save_full_model(
+    saved_thinker_path: str,
+    base_model_path: str,
+    save_path: str = "./merged_model_checkpoint",
+    extra_file: str = "spk_dict.pt",
+):
+    """Put a separately saved thinker into the original model and save the complete model.
+
+    Both models are loaded on CPU. The processor is taken from ``saved_thinker_path``, and
+    ``extra_file`` is copied from the base model directory if it exists there.
+
+    Args:
+        saved_thinker_path (str): Path to the saved thinker weights.
+        base_model_path (str): Directory path of the original model.
+        save_path (str): Directory where the merged model and configurations will be saved.
+        extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
+    """
+    load_kwargs = {"torch_dtype": "auto", "device_map": "cpu"}
+
+    # Load the thinker first, then the original model, and swap the thinker in.
+    thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(saved_thinker_path, **load_kwargs)
+    full_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, **load_kwargs)
+    full_model.thinker = thinker
+
+    # Save the complete model with the processor that was saved next to the thinker.
+    processor = AutoProcessor.from_pretrained(saved_thinker_path)
+    full_model.save_pretrained(save_path)
+    processor.save_pretrained(save_path)
+    print(f"Merged model and processor saved to {save_path}.")
+
+    # Carry the extra file over from the base model directory when it is present.
+    extra_src = os.path.join(base_model_path, extra_file)
+    extra_dst = os.path.join(save_path, extra_file)
+    if not os.path.exists(extra_src):
+        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    else:
+        shutil.copy(extra_src, extra_dst)
+        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
+
+
+# Expose two python-fire sub-commands: "save_full" (full fine-tuning) and "merge_lora" (LoRA checkpoints).
+if __name__ == "__main__":
+    fire.Fire({"save_full": save_full_model, "merge_lora": merge_lora})
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_flops.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_flops.py
+# Copyright 2025 Microsoft Corporation and the LlamaFactory team.
+#
+# This code is inspired by the Microsoft's DeepSpeed library.
+# https://www.deepspeed.ai/tutorials/flops-profiler/
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fire
+import torch
+from deepspeed.accelerator import get_accelerator  # type: ignore
+from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore
+
+from llamafactory.chat import ChatModel
+
+
+def calculate_flops(
+    model_name_or_path: str,
+    batch_size: int = 1,
+    seq_length: int = 512,
+    flash_attn: str = "auto",
+):
+    r"""Profile a pre-trained model with DeepSpeed and print its FLOPs, MACs and parameter count.
+
+    The model is fed a dummy batch of shape ``(batch_size, seq_length)`` filled with ones, used
+    both as ``input_ids`` and (cloned) as ``labels``.
+
+    Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
+    """
+    with get_accelerator().device(0):
+        chat_args = {"model_name_or_path": model_name_or_path, "template": "empty", "flash_attn": flash_attn}
+        chat_model = ChatModel(chat_args)
+        model = chat_model.engine.model
+
+        # The token values are irrelevant for profiling; only the tensor shape matters here.
+        dummy_ids = torch.ones((batch_size, seq_length), dtype=torch.long, device=model.device)
+        profile_inputs = {"input_ids": dummy_ids, "labels": dummy_ids.clone()}
+        flops, macs, params = get_model_profile(model, kwargs=profile_inputs, print_profile=True, detailed=True)
+
+        for label, value in (("FLOPs:", flops), ("MACs:", macs), ("Params:", params)):
+            print(label, value)
+
+
+# Expose calculate_flops as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(calculate_flops)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_lr.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_lr.py
+# Copyright 2025 imoneoi and the LlamaFactory team.
+#
+# This code is inspired by the imoneoi's OpenChat library.
+# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Literal
+
+import fire
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import DataCollatorForLanguageModeling
+
+from llamafactory.data import MultiModalDataCollatorForSeq2Seq, get_dataset, get_template_and_fix_tokenizer
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_tokenizer
+
+
+# Reference learning rate; calculate_lr scales it by sqrt(token_batch_size / BASE_BS).
+BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
+# Reference batch size that BASE_LR is paired with; it is compared against a token count in calculate_lr.
+BASE_BS = 4_000_000  # from llama paper
+
+
+def calculate_lr(
+    model_name_or_path: str,
+    batch_size: int,  # total batch size, i.e. (batch size * gradient accumulation * world size)
+    stage: Literal["pt", "sft"] = "sft",
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 2048,  # maximum input length during training
+    is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate
+    packing: bool = False,
+):
+    r"""Estimate a learning rate for 7B/13B models from LLaMA's hyper-parameters and print it.
+
+    The dataset is tokenized and batched, the share of label tokens that are not ``IGNORE_INDEX``
+    is measured, and ``BASE_LR`` is scaled by the square root of the effective token batch size
+    relative to ``BASE_BS``. The result is divided by 6 when ``is_mistral_or_gemma`` is set.
+
+    Usage:
+    python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en_demo --cutoff_len 1024 --batch_size 16
+    """
+    train_kwargs = {
+        "stage": stage,
+        "model_name_or_path": model_name_or_path,
+        "dataset": dataset,
+        "dataset_dir": dataset_dir,
+        "template": template,
+        "cutoff_len": cutoff_len,
+        "packing": packing,
+        "preprocessing_num_workers": 16,
+        "output_dir": "dummy_dir",
+        "overwrite_cache": True,
+        "do_train": True,
+    }
+    model_args, data_args, training_args, _, _ = get_train_args(train_kwargs)
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    chat_template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(chat_template, model_args, data_args, training_args, stage, **tokenizer_module)
+    trainset = dataset_module["train_dataset"]
+
+    # Each supported stage needs its own collator.
+    if stage == "pt":
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    elif stage == "sft":
+        data_collator = MultiModalDataCollatorForSeq2Seq(
+            template=chat_template, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
+        )
+    else:
+        raise NotImplementedError(f"Stage does not supported: {stage}.")
+
+    dataloader = DataLoader(
+        trainset, batch_size=batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True
+    )
+
+    # Count the label positions that are not IGNORE_INDEX against all label positions.
+    valid_tokens, total_tokens = 0, 0
+    for batch in tqdm(dataloader, desc="Collecting valid tokens"):
+        labels = batch["labels"]
+        valid_tokens += (labels != IGNORE_INDEX).sum().item()
+        total_tokens += labels.numel()
+
+    valid_ratio = valid_tokens / total_tokens
+    token_batch_size = cutoff_len * batch_size * valid_ratio
+    lr = BASE_LR * math.sqrt(token_batch_size / BASE_BS)  # lr ~ sqrt(batch_size)
+    if is_mistral_or_gemma:
+        lr = lr / 6.0
+
+    print(
+        f"Optimal learning rate is {lr:.2e} for valid ratio% {valid_ratio * 100:.2f} "
+        f"and effective token batch size {token_batch_size:.2f}"
+    )
+
+
+# Expose calculate_lr as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(calculate_lr)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_mfu.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_mfu.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import fire
+import torch
+import torch.distributed as dist
+from transformers import AutoConfig
+
+from llamafactory.train.tuner import run_exp
+
+
+# FLOPs counted per multiply-accumulate in a matrix multiplication.
+BASE = 2  # gemm (add + mul)
+
+
+def compute_model_flops(
+    model_name_or_path: str,
+    total_batch_size: int,
+    seq_length: int,
+    include_backward: bool = True,
+    include_recompute: bool = False,
+    include_flashattn: bool = False,
+) -> int:
+    r"""Estimate the FLOPs of one forward (and optionally backward) pass over a batch.
+
+    The estimate sums the MLP, attention projection, scaled-dot-product attention and embedding
+    terms computed from the model config.
+
+    Args:
+        model_name_or_path: Model identifier or path handed to ``AutoConfig.from_pretrained``.
+        total_batch_size: Number of sequences processed per step across all devices.
+        seq_length: Number of tokens per sequence.
+        include_backward: Count the backward pass as twice the forward cost.
+        include_recompute: Add one more forward pass of the non-embedding modules.
+        include_flashattn: Add one more pass of the attention (sdpa) term.
+
+    Returns:
+        The estimated number of floating point operations.
+
+    Raises:
+        ValueError: If the config lacks one of the fields the estimate needs.
+    """
+    config = AutoConfig.from_pretrained(model_name_or_path)
+
+    # Fail early with a readable message instead of a TypeError from arithmetic on None.
+    required_fields = (
+        "hidden_size",
+        "vocab_size",
+        "intermediate_size",
+        "num_attention_heads",
+        "num_hidden_layers",
+    )
+    missing_fields = [name for name in required_fields if getattr(config, name, None) is None]
+    if missing_fields:
+        raise ValueError(
+            f"Cannot estimate FLOPs for {model_name_or_path}: config is missing {', '.join(missing_fields)}."
+        )
+
+    hidden_size = config.hidden_size
+    vocab_size = config.vocab_size
+    intermediate_size = config.intermediate_size
+    num_attention_heads = config.num_attention_heads
+    num_hidden_layers = config.num_hidden_layers
+    tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
+
+    # Configs without grouped-query attention may not define this field; in that case every
+    # attention head has its own key/value head.
+    num_key_value_heads = getattr(config, "num_key_value_heads", None)
+    if num_key_value_heads is None:
+        num_key_value_heads = num_attention_heads
+
+    # mlp module
+    mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size  # up, gate, down
+    mlp_flops = total_batch_size * seq_length * num_hidden_layers * mlp_flops_per_token
+
+    # attn projector module: k and v are scaled down by the key/value to attention head ratio
+    q_flops_per_token = BASE * hidden_size * hidden_size
+    o_flops_per_token = BASE * hidden_size * hidden_size
+    k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
+    v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
+    attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token
+    attn_proj_flops = total_batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token
+
+    # attn sdpa module
+    sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length  # (q * k^T) * v
+    sdpa_flops = total_batch_size * num_hidden_layers * sdpa_flops_per_layer
+
+    # embedding module: counted twice when input and output embeddings are not tied
+    embedding_flops_per_token = hidden_size * vocab_size
+    embedding_flops = total_batch_size * seq_length * embedding_flops_per_token
+    if tie_word_embeddings is False:
+        embedding_flops *= 2
+
+    non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops
+    non_embedding_coeff, embedding_coeff = 1, 1
+    if include_backward:
+        non_embedding_coeff += 2
+        embedding_coeff += 2
+
+    if include_recompute:
+        non_embedding_coeff += 1
+
+    total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops
+
+    # NOTE(review): presumably accounts for attention being recomputed by flash attention - confirm.
+    if include_flashattn:
+        total_flops += sdpa_flops
+
+    return total_flops
+
+
+def compute_device_flops(world_size: int) -> float:
+    r"""Return the combined peak FLOPs per second of ``world_size`` devices of the current GPU type.
+
+    The GPU type is recognized by a substring of the CUDA device name; an unrecognized device
+    raises ``NotImplementedError``.
+    """
+    device_name = torch.cuda.get_device_name()
+
+    # (name markers, peak TFLOPs per device), checked in this order.
+    peak_tflops_table = (
+        (("H100", "H800"), 989),
+        (("A100", "A800"), 312),
+        (("V100",), 125),
+        (("4090",), 98),
+    )
+    for markers, peak_tflops in peak_tflops_table:
+        if any(marker in device_name for marker in markers):
+            return peak_tflops * 1e12 * world_size
+
+    raise NotImplementedError(f"Device not supported: {device_name}.")
+
+
+def calculate_mfu(
+    model_name_or_path: str,
+    batch_size: int = 1,
+    seq_length: int = 1024,
+    num_steps: int = 100,
+    finetuning_type: str = "lora",
+    flash_attn: str = "auto",
+    deepspeed_stage: int = 0,
+    disable_gc: bool = False,
+    liger_kernel: bool = False,
+    unsloth_gc: bool = False,
+) -> float:
+    r"""Run a short pre-training job and print the resulting model FLOPs utilization (MFU).
+
+    The training throughput is read from ``all_results.json`` in the run's output directory and
+    combined with the estimated model FLOPs and the peak device FLOPs. Only the process whose
+    ``LOCAL_RANK`` is 0 (or unset) prints the value.
+
+    NOTE(review): the signature is annotated ``-> float`` but no value is returned.
+
+    Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
+    """
+    output_dir = os.path.join("saves", "test_mfu")
+    args = {
+        "model_name_or_path": model_name_or_path,
+        "flash_attn": flash_attn,
+        "disable_gradient_checkpointing": disable_gc,
+        "enable_liger_kernel": liger_kernel,
+        "use_unsloth_gc": unsloth_gc,
+        "stage": "pt",
+        "do_train": True,
+        "finetuning_type": finetuning_type,
+        "dataset": "c4_demo",
+        "cutoff_len": seq_length,
+        "output_dir": output_dir,
+        "logging_strategy": "no",
+        "save_strategy": "no",
+        "save_only_model": True,
+        "overwrite_output_dir": True,
+        "per_device_train_batch_size": batch_size,
+        "max_steps": num_steps,
+        "bf16": True,
+    }
+    # Only ZeRO stage 2 and 3 select a DeepSpeed config file.
+    if deepspeed_stage in [2, 3]:
+        args["deepspeed"] = f"examples/deepspeed/ds_z{deepspeed_stage}_config.json"
+
+    run_exp(args)
+
+    world_size = 1
+    if dist.is_initialized():
+        # Wait until every process has finished training before reading the results.
+        dist.barrier()
+        world_size = dist.get_world_size()
+
+    if int(os.getenv("LOCAL_RANK", "0")) != 0:
+        return
+
+    with open(os.path.join(output_dir, "all_results.json"), encoding="utf-8") as f:
+        result = json.load(f)
+
+    total_batch_size = batch_size * world_size
+    steps_per_second = result["train_steps_per_second"]
+    model_flops = compute_model_flops(model_name_or_path, total_batch_size, seq_length)
+    device_flops = compute_device_flops(world_size)
+    mfu_value = steps_per_second * model_flops / device_flops
+    print(f"MFU: {mfu_value * 100:.2f}%")
+
+
+# Expose calculate_mfu as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(calculate_mfu)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_ppl.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_ppl.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from dataclasses import dataclass
+from typing import Any, Literal, Optional
+
+import fire
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import DataCollatorForLanguageModeling
+
+from llamafactory.data import MultiModalDataCollatorForSeq2Seq, get_dataset, get_template_and_fix_tokenizer
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_model, load_tokenizer
+
+
+@dataclass
+class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
+    r"""Collator for pairwise data that keeps only the "chosen" side of every example."""
+
+    # When True, the chosen input ids are used as labels instead of the chosen labels.
+    train_on_prompt: bool = False
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, torch.Tensor]:
+        r"""Rename the chosen fields to the standard keys and let the parent collator do the batching."""
+        label_key = "chosen_input_ids" if self.train_on_prompt else "chosen_labels"
+        chosen_features = [
+            {
+                "input_ids": feature["chosen_input_ids"],
+                "attention_mask": feature["chosen_attention_mask"],
+                "labels": feature[label_key],
+                "images": feature["images"],
+                "videos": feature["videos"],
+                "audios": feature["audios"],
+            }
+            for feature in features
+        ]
+        return super().__call__(chosen_features)
+
+
+def calculate_ppl(
+    model_name_or_path: str,
+    save_name: str = "ppl.json",
+    batch_size: int = 4,
+    stage: Literal["pt", "sft", "rm"] = "sft",
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 2048,
+    max_samples: Optional[int] = None,
+    train_on_prompt: bool = False,
+):
+    r"""Compute the per-sample perplexity of a pre-trained model on a dataset.
+
+    Each sample's perplexity is the exponential of its mean token loss over the label positions
+    that are not ``IGNORE_INDEX``. All values are written to ``save_name`` as a JSON list and
+    their average is printed.
+
+    Usage: export CUDA_VISIBLE_DEVICES=0
+    python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
+    """
+    train_kwargs = {
+        "stage": stage,
+        "model_name_or_path": model_name_or_path,
+        "dataset": dataset,
+        "dataset_dir": dataset_dir,
+        "template": template,
+        "cutoff_len": cutoff_len,
+        "max_samples": max_samples,
+        "train_on_prompt": train_on_prompt,
+        "preprocessing_num_workers": 16,
+        "output_dir": "dummy_dir",
+        "overwrite_cache": True,
+        "do_train": True,
+    }
+    model_args, data_args, training_args, finetuning_args, _ = get_train_args(train_kwargs)
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    chat_template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(chat_template, model_args, data_args, training_args, stage, **tokenizer_module)
+    trainset = dataset_module["train_dataset"]
+    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
+
+    # Each supported stage needs its own collator.
+    if stage == "pt":
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    elif stage == "sft":
+        data_collator = MultiModalDataCollatorForSeq2Seq(
+            template=chat_template, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
+        )
+    elif stage == "rm":
+        data_collator = PairwiseDataCollatorWithPadding(
+            template=chat_template,
+            tokenizer=tokenizer,
+            label_pad_token_id=IGNORE_INDEX,
+            train_on_prompt=train_on_prompt,
+        )
+    else:
+        raise NotImplementedError(f"Stage does not supported: {stage}.")
+
+    dataloader = DataLoader(
+        trainset, batch_size=batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True
+    )
+    # reduction="none" keeps one loss value per token so it can be averaged per sample below.
+    criterion = torch.nn.CrossEntropyLoss(reduction="none")
+    total_ppl = 0
+    perplexities = []
+    with torch.no_grad():
+        for batch in tqdm(dataloader, desc="Computing perplexities"):
+            batch = batch.to(model.device)
+            outputs = model(**batch)
+
+            # Position t is scored against the label at position t + 1.
+            shift_logits: torch.Tensor = outputs["logits"][..., :-1, :]
+            shift_labels: torch.Tensor = batch["labels"][..., 1:]
+            loss_mask = shift_labels != IGNORE_INDEX
+
+            num_rows, num_cols = shift_labels.size(0), shift_labels.size(1)
+            flat_logits = shift_logits.contiguous().view(num_rows * num_cols, -1)
+            flat_labels = shift_labels.contiguous().view(-1)
+            token_losses: torch.Tensor = criterion(flat_logits, flat_labels)
+            token_losses = token_losses.contiguous().view(shift_logits.size(0), -1)
+
+            # Mean loss per sample over the unmasked positions, then exponentiate.
+            sample_losses = (token_losses * loss_mask).sum(-1) / loss_mask.sum(-1)
+            sample_ppl = sample_losses.exp()
+            total_ppl += sample_ppl.sum().item()
+            perplexities.extend(sample_ppl.tolist())
+
+    with open(save_name, "w", encoding="utf-8") as f:
+        json.dump(perplexities, f, indent=2)
+
+    print(f"Average perplexity is {total_ppl / len(perplexities):.2f}")
+    print(f"Perplexities have been saved at {save_name}.")
+
+
+# Expose calculate_ppl as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(calculate_ppl)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/length_cdf.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/length_cdf.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import fire
+from tqdm import tqdm
+
+from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_tokenizer
+
+
+def length_cdf(
+    model_name_or_path: str,
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    interval: int = 1000,
+):
+    r"""Print the cumulative distribution of tokenized input lengths in a dataset.
+
+    Lengths are grouped into buckets of width ``interval``; for every non-empty bucket one line
+    reports how many samples (and what percentage) are shorter than the bucket's upper bound.
+
+    Usage: export CUDA_VISIBLE_DEVICES=0
+    python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
+    """
+    train_kwargs = {
+        "stage": "sft",
+        "model_name_or_path": model_name_or_path,
+        "dataset": dataset,
+        "dataset_dir": dataset_dir,
+        "template": template,
+        "cutoff_len": 1_000_000,
+        "preprocessing_num_workers": 16,
+        "output_dir": "dummy_dir",
+        "overwrite_cache": True,
+        "do_train": True,
+    }
+    model_args, data_args, training_args, _, _ = get_train_args(train_kwargs)
+    tokenizer_module = load_tokenizer(model_args)
+    chat_template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
+    dataset_module = get_dataset(chat_template, model_args, data_args, training_args, "sft", **tokenizer_module)
+    trainset = dataset_module["train_dataset"]
+    total_num = len(trainset)
+
+    # Map each bucket's lower bound to the number of samples falling into it.
+    bucket_counts = defaultdict(int)
+    for input_ids in tqdm(trainset["input_ids"], desc="Collecting lengths"):
+        bucket_start = len(input_ids) // interval * interval
+        bucket_counts[bucket_start] += 1
+
+    # Walk the buckets in ascending order and accumulate counts and percentages.
+    count_accu, prob_accu = 0, 0
+    for bucket_start, count in sorted(bucket_counts.items()):
+        count_accu += count
+        prob_accu += count / total_num * 100
+        print(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {bucket_start + interval}.")
+
+
+# Expose length_cdf as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(length_cdf)
--- a/docker-hub/qwen2.5-vl/llama-factory/scripts/vllm_infer.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/scripts/vllm_infer.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import json
+from typing import Optional
+
+import fire
+from tqdm import tqdm
+from transformers import Seq2SeqTrainingArguments
+
+from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.misc import get_device_count
+from llamafactory.extras.packages import is_vllm_available
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_tokenizer
+
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.lora.request import LoRARequest
+
+
+def vllm_infer(
+    model_name_or_path: str,
+    adapter_name_or_path: str = None,
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 2048,
+    max_samples: Optional[int] = None,
+    vllm_config: str = "{}",
+    save_name: str = "generated_predictions.jsonl",
+    temperature: float = 0.95,
+    top_p: float = 0.7,
+    top_k: int = 50,
+    max_new_tokens: int = 1024,
+    repetition_penalty: float = 1.0,
+    skip_special_tokens: bool = True,
+    default_system: Optional[str] = None,
+    enable_thinking: bool = True,
+    seed: Optional[int] = None,
+    pipeline_parallel_size: int = 1,
+    image_max_pixels: int = 768 * 768,
+    image_min_pixels: int = 32 * 32,
+    video_fps: float = 2.0,
+    video_maxlen: int = 128,
+    batch_size: int = 1024,
+):
+    r"""Perform batch generation using vLLM engine, which supports tensor parallelism.
+
+    Usage: python vllm_infer.py --model_name_or_path meta-llama/Llama-2-7b-hf --template llama --dataset alpaca_en_demo
+    """
+    if pipeline_parallel_size > get_device_count():
+        raise ValueError("Pipeline parallel size should be smaller than the number of gpus.")
+
+    model_args, data_args, _, generating_args = get_infer_args(
+        dict(
+            model_name_or_path=model_name_or_path,
+            adapter_name_or_path=adapter_name_or_path,
+            dataset=dataset,
+            dataset_dir=dataset_dir,
+            template=template,
+            cutoff_len=cutoff_len,
+            max_samples=max_samples,
+            preprocessing_num_workers=16,
+            default_system=default_system,
+            enable_thinking=enable_thinking,
+            vllm_config=vllm_config,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            max_new_tokens=max_new_tokens,
+            repetition_penalty=repetition_penalty,
+        )
+    )
+
+    training_args = Seq2SeqTrainingArguments(output_dir="dummy_dir")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
+    template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate
+
+    engine_args = {
+        "model": model_args.model_name_or_path,
+        "trust_remote_code": True,
+        "dtype": model_args.infer_dtype,
+        "max_model_len": cutoff_len + max_new_tokens,
+        "tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
+        "pipeline_parallel_size": pipeline_parallel_size,
+        "disable_log_stats": True,
+        "enable_lora": model_args.adapter_name_or_path is not None,
+    }
+    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
+        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
+
+    if isinstance(model_args.vllm_config, dict):
+        engine_args.update(model_args.vllm_config)
+
+    llm = LLM(**engine_args)
+
+    # load datasets
+    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "ppo", **tokenizer_module)
+    train_dataset = dataset_module["train_dataset"]
+
+    sampling_params = SamplingParams(
+        repetition_penalty=generating_args.repetition_penalty or 1.0,  # repetition_penalty must > 0
+        temperature=generating_args.temperature,
+        top_p=generating_args.top_p or 1.0,  # top_p must > 0
+        top_k=generating_args.top_k or -1,  # top_k must > 0
+        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
+        max_tokens=generating_args.max_new_tokens,
+        skip_special_tokens=skip_special_tokens,
+        seed=seed,
+    )
+    if model_args.adapter_name_or_path is not None:
+        lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
+    else:
+        lora_request = None
+
+    # Store all results in these lists
+    all_prompts, all_preds, all_labels = [], [], []
+
+    # Add batch process to avoid the issue of too many files opened
+    for i in tqdm(range(0, len(train_dataset), batch_size), desc="Processing batched inference"):
+        vllm_inputs, prompts, labels = [], [], []
+        batch = train_dataset[i : min(i + batch_size, len(train_dataset))]
+
+        for j in range(len(batch["input_ids"])):
+            if batch["images"][j] is not None:
+                image = batch["images"][j]
+                multi_modal_data = {
+                    "image": template_obj.mm_plugin._regularize_images(
+                        image, image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
+                    )["images"]
+                }
+            elif batch["videos"][j] is not None:
+                video = batch["videos"][j]
+                multi_modal_data = {
+                    "video": template_obj.mm_plugin._regularize_videos(
+                        video,
+                        image_max_pixels=image_max_pixels,
+                        image_min_pixels=image_min_pixels,
+                        video_fps=video_fps,
+                        video_maxlen=video_maxlen,
+                    )["videos"]
+                }
+            elif batch["audios"][j] is not None:
+                audio = batch["audios"][j]
+                audio_data = template_obj.mm_plugin._regularize_audios(
+                    audio,
+                    sampling_rate=16000,
+                )
+                multi_modal_data = {"audio": zip(audio_data["audios"], audio_data["sampling_rates"])}
+            else:
+                multi_modal_data = None
+
+            vllm_inputs.append({"prompt_token_ids": batch["input_ids"][j], "multi_modal_data": multi_modal_data})
+            prompts.append(tokenizer.decode(batch["input_ids"][j], skip_special_tokens=skip_special_tokens))
+            labels.append(
+                tokenizer.decode(
+                    list(filter(lambda x: x != IGNORE_INDEX, batch["labels"][j])),
+                    skip_special_tokens=skip_special_tokens,
+                )
+            )
+
+        results = llm.generate(vllm_inputs, sampling_params, lora_request=lora_request)
+        preds = [result.outputs[0].text for result in results]
+
+        # Accumulate results
+        all_prompts.extend(prompts)
+        all_preds.extend(preds)
+        all_labels.extend(labels)
+        gc.collect()
+
+    # Write all results at once outside the loop
+    with open(save_name, "w", encoding="utf-8") as f:
+        for text, pred, label in zip(all_prompts, all_preds, all_labels):
+            f.write(json.dumps({"prompt": text, "predict": pred, "label": label}, ensure_ascii=False) + "\n")
+
+    print("*" * 70)
+    print(f"{len(all_prompts)} total generated results have been saved at {save_name}.")
+    print("*" * 70)
+
+
+# Script entry point: hand vllm_infer to python-fire so its keyword parameters
+# can be supplied on the command line (see the usage line in its docstring).
+if __name__ == "__main__":
+    fire.Fire(vllm_infer)
--- a/docker-hub/qwen2.5-vl/llama-factory/setup.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/setup.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+
+from setuptools import find_packages, setup
+
+
+def get_version() -> str:
+    """Return the package version declared in ``src/llamafactory/extras/env.py``.
+
+    The file is scanned for a ``VERSION = "..."`` style assignment.
+
+    Raises:
+        ValueError: If the file does not contain exactly one matching assignment
+            (the single-element unpacking below fails).
+    """
+    env_path = os.path.join("src", "llamafactory", "extras", "env.py")
+    with open(env_path, encoding="utf-8") as env_file:
+        source_text = env_file.read()
+
+    version_regex = r"{}\W*=\W*\"([^\"]+)\"".format("VERSION")
+    matches = re.findall(version_regex, source_text)
+    # Unpack as a 1-tuple so zero or several matches are rejected.
+    (version,) = matches
+    return version
+
+
+def get_requires() -> list[str]:
+    """Return the requirement specifiers listed in ``requirements.txt``.
+
+    Each line is stripped of surrounding whitespace first; blank lines and
+    lines whose first non-blank character is ``#`` are then dropped, so the
+    result never contains empty strings or comment lines (including indented
+    comments).
+
+    Returns:
+        The remaining lines, in file order.
+    """
+    with open("requirements.txt", encoding="utf-8") as f:
+        # Strip before filtering so indented comments are recognized as comments.
+        stripped_lines = (line.strip() for line in f)
+        return [line for line in stripped_lines if line and not line.startswith("#")]
+
+
+def get_console_scripts() -> list[str]:
+    """Return the console-script entry points to register.
+
+    ``llamafactory-cli`` is always present. The short ``lmf`` alias is added
+    when the ``ENABLE_SHORT_CONSOLE`` environment variable (default ``"1"``)
+    is, case-insensitively, one of ``true``, ``y`` or ``1``.
+    """
+    short_alias_enabled = os.getenv("ENABLE_SHORT_CONSOLE", "1").lower() in ["true", "y", "1"]
+    entry_points = ["llamafactory-cli = llamafactory.cli:main"]
+    if short_alias_enabled:
+        entry_points.append("lmf = llamafactory.cli:main")
+
+    return entry_points
+
+
+# Optional dependency groups, keyed by extra name. This mapping is passed to
+# setup() as ``extras_require`` in main() below.
+extra_require = {
+    "torch": ["torch>=2.0.0", "torchvision>=0.15.0"],
+    # The NPU group pins exact torch-npu / torchvision versions.
+    "torch-npu": ["torch-npu==2.5.1", "torchvision==0.20.1", "decorator"],
+    "metrics": ["nltk", "jieba", "rouge-chinese"],
+    "deepspeed": ["deepspeed>=0.10.0,<=0.16.9"],
+    "liger-kernel": ["liger-kernel>=0.5.5"],
+    "bitsandbytes": ["bitsandbytes>=0.39.0"],
+    "hqq": ["hqq"],
+    "eetq": ["eetq"],
+    "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"],
+    "aqlm": ["aqlm[gpu]>=1.1.0"],
+    "vllm": ["vllm>=0.4.3,<=0.9.1"],
+    # The sglang group also pins an exact transformers version.
+    "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"],
+    "galore": ["galore-torch"],
+    "apollo": ["apollo-torch"],
+    "badam": ["badam>=1.2.1"],
+    "adam-mini": ["adam-mini"],
+    "minicpm_v": [
+        "soundfile",
+        "torchvision",
+        "torchaudio",
+        "vector_quantize_pytorch",
+        "vocos",
+        "msgpack",
+        "referencing",
+        "jsonschema_specifications",
+    ],
+    "openmind": ["openmind"],
+    "swanlab": ["swanlab"],
+    # Tooling for development: hooks, linting, tests and packaging.
+    "dev": ["pre-commit", "ruff", "pytest", "build"],
+}
+
+
+def main():
+    """Run the setuptools build configuration for the ``llamafactory`` package.
+
+    The version comes from ``get_version()``, the install requirements from
+    ``get_requires()``, the optional groups from ``extra_require`` and the
+    console scripts from ``get_console_scripts()``. Packages are discovered
+    under ``src``.
+    """
+    # Read the README inside a context manager so the file handle is closed
+    # deterministically instead of being left to the garbage collector.
+    with open("README.md", encoding="utf-8") as readme_file:
+        long_description = readme_file.read()
+
+    setup(
+        name="llamafactory",
+        version=get_version(),
+        author="hiyouga",
+        author_email="hiyouga@buaa.edu.cn",
+        description="Unified Efficient Fine-Tuning of 100+ LLMs",
+        long_description=long_description,
+        long_description_content_type="text/markdown",
+        keywords=["AI", "LLM", "GPT", "ChatGPT", "Llama", "Transformer", "DeepSeek", "Pytorch"],
+        license="Apache 2.0 License",
+        url="https://github.com/hiyouga/LLaMA-Factory",
+        package_dir={"": "src"},
+        packages=find_packages("src"),
+        python_requires=">=3.9.0",
+        install_requires=get_requires(),
+        extras_require=extra_require,
+        entry_points={"console_scripts": get_console_scripts()},
+        classifiers=[
+            "Development Status :: 4 - Beta",
+            "Intended Audience :: Developers",
+            "Intended Audience :: Education",
+            "Intended Audience :: Science/Research",
+            "License :: OSI Approved :: Apache Software License",
+            "Operating System :: OS Independent",
+            "Programming Language :: Python :: 3",
+            "Programming Language :: Python :: 3.9",
+            "Programming Language :: Python :: 3.10",
+            "Programming Language :: Python :: 3.11",
+            "Programming Language :: Python :: 3.12",
+            "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        ],
+    )
+
+
+# Only invoke setup() when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/docker-hub/qwen2.5-vl/llama-factory/src/api.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/src/api.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import uvicorn
+
+from llamafactory.api.app import create_app
+from llamafactory.chat import ChatModel
+
+
+def main():
+    """Build the chat model and API app, then serve it with uvicorn.
+
+    The bind address comes from ``API_HOST`` (default ``0.0.0.0``) and the port
+    from ``API_PORT`` (default ``8000``).
+    """
+    served_app = create_app(ChatModel())
+    bind_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print(f"Visit http://localhost:{api_port}/docs for API document.")
+    uvicorn.run(served_app, host=bind_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()
--- a/docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/__init__.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/__init__.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Efficient fine-tuning of large language models.
+
+Level:
+  api, webui > chat, eval, train > data, model > hparams > extras
+
+Disable version checking: DISABLE_VERSION_CHECK=1
+Enable VRAM recording: RECORD_VRAM=1
+Force using torchrun: FORCE_TORCHRUN=1
+Set logging verbosity: LLAMAFACTORY_VERBOSITY=WARN
+Use modelscope: USE_MODELSCOPE_HUB=1
+Use openmind: USE_OPENMIND_HUB=1
+"""
+
+from .extras.env import VERSION
+
+
+# Expose the VERSION constant imported from .extras.env as the package's __version__.
+__version__ = VERSION
--- a/docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/__init__.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/__init__.py
--- a/docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/app.py
+++ b/docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/app.py
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from contextlib import asynccontextmanager
+from functools import partial
+from typing import Annotated, Optional
+
+from ..chat import ChatModel
+from ..extras.constants import EngineName
+from ..extras.misc import torch_gc
+from ..extras.packages import is_fastapi_available, is_starlette_available, is_uvicorn_available
+from .chat import (
+    create_chat_completion_response,
+    create_score_evaluation_response,
+    create_stream_chat_completion_response,
+)
+from .protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ModelCard,
+    ModelList,
+    ScoreEvaluationRequest,
+    ScoreEvaluationResponse,
+)
+
+
+if is_fastapi_available():
+    from fastapi import Depends, FastAPI, HTTPException, status
+    from fastapi.middleware.cors import CORSMiddleware
+    from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer
+
+
+if is_starlette_available():
+    from sse_starlette import EventSourceResponse
+
+
+if is_uvicorn_available():
+    import uvicorn
+
+
+async def sweeper() -> None:
+    """Call ``torch_gc()`` in an endless loop, sleeping 300 seconds between calls.
+
+    The coroutine never returns on its own; it only stops when it is cancelled
+    or when ``torch_gc()`` raises.
+    """
+    interval_seconds = 300
+    while True:
+        torch_gc()
+        await asyncio.sleep(interval_seconds)
+
+
+@asynccontextmanager
+async def lifespan(app: "FastAPI", chat_model: "ChatModel"):  # collects GPU memory
+    """Application lifespan hook that manages periodic and final ``torch_gc()`` calls.
+
+    On startup, when the chat model's engine is the HF engine, a background
+    ``sweeper()`` task is started. On shutdown that task is cancelled and
+    ``torch_gc()`` is called once more.
+
+    Args:
+        app: The application instance supplied by the framework (unused here).
+        chat_model: The chat model whose ``engine.name`` decides whether the
+            sweeper is started.
+    """
+    sweeper_task = None
+    if chat_model.engine.name == EngineName.HF:
+        # Keep a strong reference: the event loop only holds weak references to
+        # tasks, so a discarded task may be garbage-collected mid-execution.
+        sweeper_task = asyncio.create_task(sweeper())
+
+    try:
+        yield
+    finally:
+        if sweeper_task is not None:
+            sweeper_task.cancel()
+            # Wait for the cancellation to complete; return_exceptions=True keeps
+            # the sweeper's CancelledError (or an earlier failure) from
+            # propagating into shutdown.
+            await asyncio.gather(sweeper_task, return_exceptions=True)
+
+        # Run the final collection even if the application body raised.
+        torch_gc()
+
+
+def create_app(chat_model: "ChatModel") -> "FastAPI":
+    """Create the FastAPI application exposing the chat model over HTTP.
+
+    Registered routes:
+        * ``GET /v1/models`` - returns a single model card.
+        * ``POST /v1/chat/completions`` - chat completion, optionally streamed.
+        * ``POST /v1/score/evaluation`` - score evaluation.
+
+    Environment variables read here:
+        * ``FASTAPI_ROOT_PATH`` - root path for the app (default empty).
+        * ``API_KEY`` - when set to a non-empty value, every route requires a
+          matching bearer token; otherwise no authentication is enforced.
+        * ``API_MODEL_NAME`` - model id reported by ``/v1/models``
+          (default ``gpt-3.5-turbo``); read on each request.
+
+    Args:
+        chat_model: The chat model used to serve requests and bound into the
+            application lifespan.
+
+    Returns:
+        The configured FastAPI application.
+    """
+    import secrets  # local import: only needed for the constant-time key check
+
+    root_path = os.getenv("FASTAPI_ROOT_PATH", "")
+    app = FastAPI(lifespan=partial(lifespan, chat_model=chat_model), root_path=root_path)
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+    api_key = os.getenv("API_KEY")
+    # auto_error=False lets requests without credentials reach verify_api_key,
+    # which decides whether a key is required at all.
+    security = HTTPBearer(auto_error=False)
+
+    async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
+        # Authentication is only enforced when API_KEY is configured (non-empty).
+        if not api_key:
+            return
+
+        # Compare in constant time so response timing does not leak how much of
+        # the supplied token matches; bytes are used so non-ASCII input is accepted.
+        if auth is None or not secrets.compare_digest(auth.credentials.encode("utf-8"), api_key.encode("utf-8")):
+            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
+
+    # NOTE: route handlers use comments rather than docstrings because FastAPI
+    # would publish docstrings as OpenAPI descriptions.
+    @app.get(
+        "/v1/models",
+        response_model=ModelList,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
+    async def list_models():
+        # Report a single model card whose id comes from API_MODEL_NAME.
+        model_card = ModelCard(id=os.getenv("API_MODEL_NAME", "gpt-3.5-turbo"))
+        return ModelList(data=[model_card])
+
+    @app.post(
+        "/v1/chat/completions",
+        response_model=ChatCompletionResponse,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
+    async def create_chat_completion(request: ChatCompletionRequest):
+        # Rejected with 405 when the loaded engine cannot generate.
+        if not chat_model.engine.can_generate:
+            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+
+        if request.stream:
+            # Streaming responses are delivered as server-sent events.
+            generate = create_stream_chat_completion_response(request, chat_model)
+            return EventSourceResponse(generate, media_type="text/event-stream", sep="\n")
+        else:
+            return await create_chat_completion_response(request, chat_model)
+
+    @app.post(
+        "/v1/score/evaluation",
+        response_model=ScoreEvaluationResponse,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
+    async def create_score_evaluation(request: ScoreEvaluationRequest):
+        # Inverse of the chat route: rejected with 405 when the engine CAN generate.
+        if chat_model.engine.can_generate:
+            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+
+        return await create_score_evaluation_response(request, chat_model)
+
+    return app
+
+
+def run_api() -> None:
+    """Build the chat model and API app, then serve it with uvicorn.
+
+    The bind address comes from ``API_HOST`` (default ``0.0.0.0``) and the port
+    from ``API_PORT`` (default ``8000``).
+    """
+    served_app = create_app(ChatModel())
+    bind_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print(f"Visit http://localhost:{api_port}/docs for API document.")
+    uvicorn.run(served_app, host=bind_host, port=api_port)