# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from llamafactory.train.test_utils import load_infer_model, load_train_model
TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "sft",
"do_train": True,
"finetuning_type": "freeze",
"dataset": "llamafactory/tiny-supervised-dataset",
"dataset_dir": "ONLINE",
"template": "llama3",
"cutoff_len": 1024,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
INFER_ARGS = {
"model_name_or_path": TINY_LLAMA,
"finetuning_type": "freeze",
"template": "llama3",
"infer_dtype": "float16",
}
def test_freeze_train_all_modules():
model = load_train_model(freeze_trainable_layers=1, **TRAIN_ARGS)
for name, param in model.named_parameters():
if name.startswith("model.layers.1."):
assert param.requires_grad is True
assert param.dtype == torch.float32
else:
assert param.requires_grad is False
assert param.dtype == torch.float16
def test_freeze_train_extra_modules():
model = load_train_model(freeze_trainable_layers=1, freeze_extra_modules="embed_tokens,lm_head", **TRAIN_ARGS)
for name, param in model.named_parameters():
if name.startswith("model.layers.1.") or any(module in name for module in ["embed_tokens", "lm_head"]):
assert param.requires_grad is True
assert param.dtype == torch.float32
else:
assert param.requires_grad is False
assert param.dtype == torch.float16
def test_freeze_inference():
model = load_infer_model(**INFER_ARGS)
for param in model.parameters():
assert param.requires_grad is False
assert param.dtype == torch.float16
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from llamafactory.train.test_utils import load_infer_model, load_train_model
TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "sft",
"do_train": True,
"finetuning_type": "full",
"dataset": "llamafactory/tiny-supervised-dataset",
"dataset_dir": "ONLINE",
"template": "llama3",
"cutoff_len": 1024,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
INFER_ARGS = {
"model_name_or_path": TINY_LLAMA,
"finetuning_type": "full",
"template": "llama3",
"infer_dtype": "float16",
}
def test_full_train():
model = load_train_model(**TRAIN_ARGS)
for param in model.parameters():
assert param.requires_grad is True
assert param.dtype == torch.float32
def test_full_inference():
model = load_infer_model(**INFER_ARGS)
for param in model.parameters():
assert param.requires_grad is False
assert param.dtype == torch.float16
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pytest
import torch
from llamafactory.train.test_utils import (
check_lora_model,
compare_model,
load_infer_model,
load_reference_model,
load_train_model,
patch_valuehead_model,
)
TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "sft",
"do_train": True,
"finetuning_type": "lora",
"dataset": "llamafactory/tiny-supervised-dataset",
"dataset_dir": "ONLINE",
"template": "llama3",
"cutoff_len": 1024,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
INFER_ARGS = {
"model_name_or_path": TINY_LLAMA,
"adapter_name_or_path": TINY_LLAMA_ADAPTER,
"finetuning_type": "lora",
"template": "llama3",
"infer_dtype": "float16",
}
@pytest.fixture
def fix_valuehead_cpu_loading():
patch_valuehead_model()
def test_lora_train_qv_modules():
model = load_train_model(lora_target="q_proj,v_proj", **TRAIN_ARGS)
linear_modules, _ = check_lora_model(model)
assert linear_modules == {"q_proj", "v_proj"}
def test_lora_train_all_modules():
model = load_train_model(lora_target="all", **TRAIN_ARGS)
linear_modules, _ = check_lora_model(model)
assert linear_modules == {"q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"}
def test_lora_train_extra_modules():
model = load_train_model(additional_target="embed_tokens,lm_head", **TRAIN_ARGS)
_, extra_modules = check_lora_model(model)
assert extra_modules == {"embed_tokens", "lm_head"}
def test_lora_train_old_adapters():
model = load_train_model(adapter_name_or_path=TINY_LLAMA_ADAPTER, create_new_adapter=False, **TRAIN_ARGS)
ref_model = load_reference_model(TINY_LLAMA, TINY_LLAMA_ADAPTER, use_lora=True, is_trainable=True)
compare_model(model, ref_model)
def test_lora_train_new_adapters():
model = load_train_model(adapter_name_or_path=TINY_LLAMA_ADAPTER, create_new_adapter=True, **TRAIN_ARGS)
ref_model = load_reference_model(TINY_LLAMA, TINY_LLAMA_ADAPTER, use_lora=True, is_trainable=True)
compare_model(
model, ref_model, diff_keys=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"]
)
@pytest.mark.usefixtures("fix_valuehead_cpu_loading")
def test_lora_train_valuehead():
model = load_train_model(add_valuehead=True, **TRAIN_ARGS)
ref_model = load_reference_model(TINY_LLAMA_VALUEHEAD, is_trainable=True, add_valuehead=True)
state_dict = model.state_dict()
ref_state_dict = ref_model.state_dict()
assert torch.allclose(state_dict["v_head.summary.weight"], ref_state_dict["v_head.summary.weight"])
assert torch.allclose(state_dict["v_head.summary.bias"], ref_state_dict["v_head.summary.bias"])
def test_lora_inference():
model = load_infer_model(**INFER_ARGS)
ref_model = load_reference_model(TINY_LLAMA, TINY_LLAMA_ADAPTER, use_lora=True).merge_and_unload()
compare_model(model, ref_model)
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pytest
from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model, load_train_model
TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TINY_LLAMA_PISSA = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "sft",
"do_train": True,
"finetuning_type": "lora",
"pissa_init": True,
"pissa_iter": -1,
"dataset": "llamafactory/tiny-supervised-dataset",
"dataset_dir": "ONLINE",
"template": "llama3",
"cutoff_len": 1024,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
INFER_ARGS = {
"model_name_or_path": TINY_LLAMA_PISSA,
"adapter_name_or_path": TINY_LLAMA_PISSA,
"adapter_folder": "pissa_init",
"finetuning_type": "lora",
"template": "llama3",
"infer_dtype": "float16",
}
OS_NAME = os.getenv("OS_NAME", "")
@pytest.mark.xfail(reason="PiSSA initialization is not stable on different platforms.")
def test_pissa_train():
model = load_train_model(**TRAIN_ARGS)
ref_model = load_reference_model(TINY_LLAMA_PISSA, TINY_LLAMA_PISSA, use_pissa=True, is_trainable=True)
compare_model(model, ref_model)
@pytest.mark.xfail(OS_NAME.startswith("windows"), reason="Known connection error on Windows.")
def test_pissa_inference():
model = load_infer_model(**INFER_ARGS)
ref_model = load_reference_model(TINY_LLAMA_PISSA, TINY_LLAMA_PISSA, use_pissa=True, is_trainable=False)
ref_model = ref_model.merge_and_unload()
compare_model(model, ref_model)
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List
import pytest
from transformers import DataCollatorWithPadding
from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.hparams import get_train_args
from llamafactory.model import load_model, load_tokenizer
from llamafactory.train.sft.trainer import CustomSeq2SeqTrainer
DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "sft",
"do_train": True,
"finetuning_type": "lora",
"dataset": "llamafactory/tiny-supervised-dataset",
"dataset_dir": "ONLINE",
"template": "llama3",
"cutoff_len": 1024,
"overwrite_cache": False,
"overwrite_output_dir": True,
"per_device_train_batch_size": 1,
"max_steps": 1,
}
@dataclass
class DataCollatorWithVerbose(DataCollatorWithPadding):
verbose_list: List[Dict[str, Any]] = field(default_factory=list)
def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
self.verbose_list.extend(features)
batch = super().__call__(features)
return {k: v[:, :1] for k, v in batch.items()} # truncate input length
@pytest.mark.parametrize("disable_shuffling", [False, True])
def test_shuffle(disable_shuffling: bool):
model_args, data_args, training_args, finetuning_args, _ = get_train_args(
{
"output_dir": os.path.join("output", f"shuffle{str(disable_shuffling).lower()}"),
"disable_shuffling": disable_shuffling,
**TRAIN_ARGS,
}
)
tokenizer_module = load_tokenizer(model_args)
tokenizer = tokenizer_module["tokenizer"]
template = get_template_and_fix_tokenizer(tokenizer, data_args)
dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
data_collator = DataCollatorWithVerbose(tokenizer=tokenizer)
trainer = CustomSeq2SeqTrainer(
model=model,
args=training_args,
finetuning_args=finetuning_args,
data_collator=data_collator,
**dataset_module,
**tokenizer_module,
)
trainer.train()
if disable_shuffling:
assert data_collator.verbose_list[0]["input_ids"] == dataset_module["train_dataset"][0]["input_ids"]
else:
assert data_collator.verbose_list[0]["input_ids"] != dataset_module["train_dataset"][0]["input_ids"]
# Qwen2.5-Coder
## Paper
- https://arxiv.org/pdf/2409.12186
## Model Architecture
Qwen2.5-Coder is built on the Qwen (Tongyi Qianwen) family of large language models and is specifically optimized for code-related tasks. It can generate and explain code in a wide range of programming languages, such as Python, Java, and C++.
<div align=center>
<img src="./imgs/qwen2.5.png"/>
</div>
## Algorithm
**Architecture**
Qwen2.5-Coder is based on Qwen2.5 and comes in six sizes: 0.5B, 1.5B, 3B, 7B, 14B, and 32B. All sizes share a head size of 128; the differences lie mainly in structural dimensions such as the hidden size and the number of layers. The detailed sizes and architectures are listed in the table below:
<div align=center>
<img src="./imgs/qwen2.5-coder.png"/>
</div>
**Tokenization**
On top of the Qwen2.5 vocabulary, Qwen2.5-Coder adds several special tokens that help the model understand the various formats found in code. For example, <|endoftext|> marks the end of a text or paragraph; <|fim_prefix|>, <|fim_middle|>, and <|fim_suffix|> implement the Fill-in-the-Middle (FIM) technique, which lets the model predict a missing block of code; <|fim_pad|> is used for padding in FIM; and <|repo_name|> and <|file_sep|> mark the repository name and the boundaries between files in a repository, respectively:
<div align=center>
<img src="./imgs/tokens.png"/>
</div>
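As an illustration of how these special tokens are used, here is a minimal sketch of a Fill-in-the-Middle prompt; the prefix and suffix snippets below are made up purely for this example:
```python
# Hypothetical prefix/suffix, used only to show the FIM prompt layout.
prefix_code = "def add(a, b):\n"
suffix_code = "\n    return result\n"

# The model is asked to predict the missing middle segment between prefix and suffix.
fim_prompt = "<|fim_prefix|>" + prefix_code + "<|fim_suffix|>" + suffix_code + "<|fim_middle|>"
print(fim_prompt)
```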
**Data**
A custom dataset, Qwen2.5-Coder-Data, was built to train Qwen2.5-Coder. It consists of five parts:
- Source Code Data: source code collected from public GitHub repositories created before February 2024, covering 92 programming languages
- Text-Code Grounding Data: mixed text-and-code data, mainly technical blogs, code-related documentation, and tutorials
- Synthetic Data: synthetic data generated with CodeQwen1.5
- Math Data: math data taken from the Qwen2.5-Math dataset to strengthen the model's mathematical reasoning
- Text Data: general text from the Qwen2.5 dataset to strengthen the model's general capabilities
These five data types are mixed to train a foundational model. Qwen2.5-Coder experimented with several mixing ratios and finally chose Code:Text:Math = 7:2:1, for a total of 5.2T tokens:
<div align=center>
<img src="./imgs/Mixdata.png"/>
</div>
**Training Policy**
A three-stage training strategy is used, starting from Qwen2.5 as the baseline. The first stage is file-level training on the mixed dataset described above (5.2T tokens); the second stage is repository-level training (300B tokens) to strengthen the model's long-context ability. These two stages produce the Qwen2.5-Coder-Base models.
To make the model better at interaction and at following user instructions, instruction fine-tuning is performed on top of Qwen2.5-Coder-32B-Base to obtain the Instruct version. For this, a synthetic instruction-tuning dataset was built: code snippets from GitHub are fed to an LLM to generate corresponding instructions, and LLM scoring is used to filter out low-quality samples. Besides this synthetic dataset, several open-source instruction datasets such as McEval-Instruct are also used. The overall training pipeline is shown below:
<div align=center>
<img src="./imgs/Training.png"/>
</div>
## Environment Setup
### Docker (Method 1)
Pull the Docker image from [光源](https://www.sourcefind.cn/#/service-list):
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10
```
Create a container and mount a working directory for development:
```
docker run -it --name {name} --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v {}:{} {docker_image} /bin/bash
# Note 1: replace {name} with a custom container name
# Note 2: replace {docker_image} with the name of the image used to create the container
# Note 3: -v mounts a host path to the specified path inside the container
pip install -r requirements.txt
cd LLaMA-Factory
pip install -e ".[torch,metrics]"
pip install deepspeed-0.14.2+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
### Dockerfile (Method 2)
```
cd docker
docker build --no-cache -t qwen2_5_coder:1.0 .
docker run -it --name {name} --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v {}:{} {docker_image} /bin/bash
pip install -r requirements.txt
```
### Anaconda (Method 3)
On online compute nodes, conda is recommended for setting up the environment.
Create and activate a conda environment with Python 3.10:
```
conda create -n qwen2_5_coder python=3.10
conda activate qwen2_5_coder
```
The special deep-learning libraries required by this project for DCU GPUs can be downloaded and installed from the [光合](https://developer.hpccube.com/tool/) developer community.
```
DTK driver: dtk25.04.1
python:python3.10
pytorch:2.4.1
torchvision:0.19.1
deepspeed:0.14.2
```
Install the remaining dependencies:
```
pip install -r requirements.txt
cd LLaMA-Factory
pip install -e ".[torch,metrics]"
pip install deepspeed-0.14.2+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
Download the pretrained weights and extract them. Qwen2.5-Coder-32B-Instruct is used as the example here; other model versions can be downloaded in the **Other Pretrained Weights** section.
Qwen2.5-Coder-32B-Instruct pretrained weights: [HuggingFace download](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)
Fast download via SCNet: [SCNet download](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-32B-Instruct)
Save the Qwen/Qwen2.5-Coder-32B-Instruct folder into LLaMA-Factory.
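Alternatively, the weights can be fetched programmatically. The snippet below is a minimal sketch assuming `huggingface_hub` is installed and the target directory is writable:
```python
from huggingface_hub import snapshot_download

# Download the Instruct weights into the LLaMA-Factory working directory.
snapshot_download(
    repo_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    local_dir="LLaMA-Factory/Qwen/Qwen2.5-Coder-32B-Instruct",
)
```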
## Dataset
## Training
Fine-tune with the LLaMA-Factory framework.
### Single-node, single-GPU (LoRA fine-tuning)
```
# Note: update the model path in the .yaml file to match your model
HIP_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_lora/qwen2vl_lora_sft_custom.yaml
# If GPU memory is insufficient, ZeRO-Offload can be used to reduce memory usage
HIP_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_lora/qwen2vl_lora_sft_offload_custom.yaml
```
### Single-node, multi-GPU (LoRA fine-tuning)
```
# Note: update the model path in the .yaml file to match your model
HIP_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/train_lora/qwen2vl_lora_sft_offload_custom.yaml
```
## Inference
```
# Note: update the model path in the .yaml file to match your model
HIP_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/qwen2_coder_custom.yaml
```
## Result
<div align=center>
<img src="./imgs/inference.png"/>
</div>
## Accuracy
Comparison of code reasoning ability (MdEval) between Qwen2.5-Coder-32B-Instruct and several current open-source code LLMs:
<div align=center>
<img src="./imgs/eval.png"/>
</div>
## Application Scenarios
### Algorithm Category
Code generation
### Key Application Industries
Code generation, code editing, education
## Other Pretrained Weights
Instruct models
[Qwen2.5-Coder-0.5B-Instruct](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-0.5B-Instruct)
[Qwen2.5-Coder-1.5B-Instruct](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-1.5B-Instruct)
[Qwen2.5-Coder-3B-Instruct](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-3B-Instruct)
[Qwen2.5-Coder-7B-Instruct](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-7B-Instruct)
[Qwen2.5-Coder-14B-Instruct](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-14B-Instruct)
Base models
[Qwen2.5-Coder-0.5B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-0.5B)
[Qwen2.5-Coder-1.5B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-1.5B)
[Qwen2.5-Coder-3B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-3B)
[Qwen2.5-Coder-7B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-7B)
[Qwen2.5-Coder-14B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-14B)
[Qwen2.5-Coder-32B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Coder-32B)
## Source Repository and Issue Feedback
[https://developer.sourcefind.cn/codes/dongchy920/qwen2.5_coder](https://developer.sourcefind.cn/codes/dongchy920/qwen2.5_coder)
## References
[https://github.com/QwenLM/Qwen2.5-Coder](https://github.com/QwenLM/Qwen2.5-Coder)
[https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)
.left_header {
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
}
.right_panel {
margin-top: 66px;
border: 1px solid #BFBFC4;
border-radius: 8px;
overflow: hidden;
}
.render_header {
height: 30px;
width: 100%;
padding: 5px 16px;
background-color: #f5f5f5;
}
.header_btn {
display: inline-block;
height: 10px;
width: 10px;
border-radius: 50%;
margin-right: 4px;
}
.render_header > .header_btn:nth-child(1) {
background-color: #f5222d;
}
.render_header > .header_btn:nth-child(2) {
background-color: #faad14;
}
.render_header > .header_btn:nth-child(3) {
background-color: #52c41a;
}
.right_content {
height: 920px;
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
}
.history_chatbot button {
background: none;
border: none;
}
.html_content {
width: 100%;
height: 920px;
}
import os
import re
from http import HTTPStatus
from typing import Dict, List, Optional, Tuple
import base64
import dashscope
import gradio as gr
from dashscope import Generation
from dashscope.api_entities.dashscope_response import Role
import modelscope_studio.components.base as ms
import modelscope_studio.components.legacy as legacy
import modelscope_studio.components.antd as antd
from config import DEMO_LIST, SystemPrompt
YOUR_API_TOKEN = os.getenv('YOUR_API_TOKEN')
dashscope.api_key = YOUR_API_TOKEN
History = List[Tuple[str, str]]
Messages = List[Dict[str, str]]
def history_to_messages(history: History, system: str) -> Messages:
messages = [{'role': Role.SYSTEM, 'content': system}]
for h in history:
messages.append({'role': Role.USER, 'content': h[0]})
messages.append({'role': Role.ASSISTANT, 'content': h[1]})
return messages
def messages_to_history(messages: Messages) -> History:
assert messages[0]['role'] == Role.SYSTEM
history = []
for q, r in zip(messages[1::2], messages[2::2]):
history.append([q['content'], r['content']])
return history
def remove_code_block(text):
pattern = r'```html\n(.+?)\n```'
match = re.search(pattern, text, re.DOTALL)
if match:
return match.group(1).strip()
else:
return text.strip()
def history_render(history: History):
return gr.update(open=True), history
def clear_history():
return []
def send_to_sandbox(code):
encoded_html = base64.b64encode(code.encode('utf-8')).decode('utf-8')
data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
return f"<iframe src=\"{data_uri}\" width=\"100%\" height=\"920px\"></iframe>"
# return {
# '/src/App.jsx': {
# 'code': code,
# 'fpath': '/src/App.jsx',
# },
# # keys are file paths and must be given as absolute paths
# '/src/index.js': {
# 'code':
# 'import React from "react"; import ReactDOM from "react-dom"; import App from "./App"; const rootElement = document.getElementById("root"); ReactDOM.render(<App />, rootElement);',
# 'fpath': '/src/index.js',
# },
# '/package.json': {
# 'code': '{"name":"demo", "main": "./src/index.js", "dependencies":{ "react": "18.3.1", "react-dom": "18.3.1", "antd": "5.21.6", "styled-components": "6.1.13" }}',
# 'fpath': '/package.json',
# },
# }
def demo_card_click(e: gr.EventData):
index = e._data['component']['index']
return DEMO_LIST[index]['description']
with gr.Blocks(css_paths="app.css") as demo:
history = gr.State([])
setting = gr.State({
"system": SystemPrompt,
})
with ms.Application() as app:
with antd.ConfigProvider():
with antd.Row(gutter=[32, 12]) as layout:
with antd.Col(span=24, md=8):
with antd.Flex(vertical=True, gap="middle", wrap=True):
header = gr.HTML("""
<div class="left_header">
<img src="//img.alicdn.com/imgextra/i2/O1CN01KDhOma1DUo8oa7OIU_!!6000000000220-1-tps-240-240.gif" width="200px" />
<h1>Qwen2.5-Coder</h1>
</div>
""")
input = antd.InputTextarea(
size="large", allow_clear=True, placeholder="Please enter what kind of application you want")
# input = gr.TextArea(placeholder="Please describe the application you want", show_label=False, container=False)
btn = antd.Button("send", type="primary", size="large")
clear_btn = antd.Button("clear history", type="default", size="large")
antd.Divider("examples")
with antd.Flex(gap="small", wrap=True):
with ms.Each(DEMO_LIST):
with antd.Card(hoverable=True, as_item="card") as demoCard:
antd.CardMeta()
demoCard.click(demo_card_click, outputs=[input])
antd.Divider("setting")
with antd.Flex(gap="small", wrap=True):
settingPromptBtn = antd.Button(
"⚙️ set system Prompt", type="default")
codeBtn = antd.Button("🧑‍💻 view code", type="default")
historyBtn = antd.Button("📜 history", type="default")
with antd.Modal(open=False, title="set system Prompt", width="800px") as system_prompt_modal:
systemPromptInput = antd.InputTextarea(
SystemPrompt, auto_size=True)
settingPromptBtn.click(lambda: gr.update(
open=True), inputs=[], outputs=[system_prompt_modal])
system_prompt_modal.ok(lambda input: ({"system": input}, gr.update(
open=False)), inputs=[systemPromptInput], outputs=[setting, system_prompt_modal])
system_prompt_modal.cancel(lambda: gr.update(
open=False), outputs=[system_prompt_modal])
with antd.Drawer(open=False, title="code", placement="left", width="750px") as code_drawer:
code_output = legacy.Markdown()
codeBtn.click(lambda: gr.update(open=True),
inputs=[], outputs=[code_drawer])
code_drawer.close(lambda: gr.update(
open=False), inputs=[], outputs=[code_drawer])
with antd.Drawer(open=False, title="history", placement="left", width="900px") as history_drawer:
history_output = legacy.Chatbot(show_label=False, flushing=False, height=960, elem_classes="history_chatbot")
historyBtn.click(history_render, inputs=[history], outputs=[history_drawer, history_output])
history_drawer.close(lambda: gr.update(
open=False), inputs=[], outputs=[history_drawer])
with antd.Col(span=24, md=16):
with ms.Div(elem_classes="right_panel"):
gr.HTML('<div class="render_header"><span class="header_btn"></span><span class="header_btn"></span><span class="header_btn"></span></div>')
with antd.Tabs(active_key="empty", render_tab_bar="() => null") as state_tab:
with antd.Tabs.Item(key="empty"):
empty = antd.Empty(description="empty input", elem_classes="right_content")
with antd.Tabs.Item(key="loading"):
loading = antd.Spin(True, tip="coding...", size="large", elem_classes="right_content")
with antd.Tabs.Item(key="render"):
sandbox = gr.HTML(elem_classes="html_content")
# sandbox = pro.FrontendCodeSandbox(elem_style={
# 'height': '920px',
# 'width': '100%'
# })
def generation_code(query: Optional[str], _setting: Dict[str, str], _history: Optional[History]):
if query is None:
query = ''
if _history is None:
_history = []
messages = history_to_messages(_history, _setting['system'])
messages.append({'role': Role.USER, 'content': query})
gen = Generation.call(model="qwen2.5-coder-32b-instruct",
messages=messages,
result_format='message',
stream=True)
for response in gen:
if response.status_code == HTTPStatus.OK:
role = response.output.choices[0].message.role
content = response.output.choices[0].message.content
if response.output.choices[0].finish_reason == 'stop':
_history = messages_to_history(messages + [{
'role': role,
'content': content
}])
print('history')
print(_history)
yield {
code_output: content,
history: _history,
sandbox: send_to_sandbox(remove_code_block(content)),
state_tab: gr.update(active_key="render"),
code_drawer: gr.update(open=False),
}
else:
yield {
code_output: content,
state_tab: gr.update(active_key="loading"),
code_drawer: gr.update(open=True),
}
else:
raise ValueError(
'Request id: %s, Status code: %s, error code: %s, error message: %s'
% (response.request_id, response.status_code, response.code,
response.message))
btn.click(generation_code,
inputs=[input, setting, history],
outputs=[code_output, history, sandbox, state_tab, code_drawer])
clear_btn.click(clear_history, inputs=[], outputs=[history])
if __name__ == "__main__":
demo.queue(default_concurrency_limit=20).launch(ssr_mode=False)
# SystemPrompt = """You are an expert React, JavaScript, and Ant-Design Components developer with a keen eye for modern, aesthetically pleasing design.
# Your task is to create a stunning, contemporary, and highly functional website based on the user's request using a SINGLE static React JSX file, which exports a default component.
# This code will go directly into the App.jsx file and will be used to render the website.
# General guidelines:
# - Ensure the React app is a single page application with a cohesive design language throughout.
# - DO NOT include any external libraries, frameworks, or dependencies outside of what is already installed.
# - For icons, create simple, elegant SVG icons. DO NOT use any icon libraries.
# - Use styled-components to add any style, DO NOT return any extra css file.
# - Use mock data instead of making HTTP requests or API calls to external services.
# - Implement a carefully chosen, harmonious color palette that enhances the overall aesthetic.
# - Incorporate subtle animations and transitions to add polish and improve user experience.
# - Ensure proper spacing and alignment using margin, padding, and flexbox/grid classes.
# - Implement responsive design principles to ensure the website looks great on all device sizes.
# - Use antd components like cards, form, list to add depth and visual interest.
# - Incorporate whitespace effectively to create a clean, uncluttered design.
# Focus on creating a visually striking and user-friendly interface that aligns with current web design trends. Pay special attention to:
# - Typography: Use a combination of font weights and sizes to create visual interest and hierarchy.
# - Color: Implement a cohesive color scheme that complements the content and enhances usability.
# - Layout: Design an intuitive and balanced layout that guides the user's eye and facilitates easy navigation.
# - Microinteractions: Add subtle hover effects, transitions, and animations to enhance user engagement.
# - Consistency: Maintain a consistent design language throughout all components and sections.
# Remember to only return code for the App.jsx file and nothing else. Prioritize creating an exceptional layout, styling, and reactivity. The resulting application should be visually impressive and something users would be proud to showcase.
# Remember not add any description, just return the code only.
# """
SystemPrompt = """
You are a web development engineer, writing web pages according to the instructions below. You are a powerful code editing assistant capable of writing code and creating artifacts in conversations with users, or modifying and updating existing artifacts as requested by users.
All code is written in a single code block to form a complete code file for display, without separating HTML and JavaScript code. An artifact refers to a runnable complete code snippet, you prefer to integrate and output such complete runnable code rather than breaking it down into several code blocks. For certain types of code, they can render graphical interfaces in a UI window. After generation, please check the code execution again to ensure there are no errors in the output.
Output only the HTML, without any additional descriptive text.
"""
DEMO_LIST = [
{
"card": {
"index": 0,
},
"title": "Qwen,Start!",
"description": "Help me design an interface with a purple button that says 'Qwen, Start!'. When the button is clicked, display a countdown from 5 in a very large font for 5 seconds.",
},
{
"card": {
"index": 1,
},
"title": "Spam with emojis!",
"description": "Write code in a single HTML file: Capture the click event, place a random number of emojis at the click position, and add gravity and collision effects to each emoji."
},
{
"card": {
"index": 2,
},
"title": "TODO list",
"description": "I want a TODO list that allows me to add tasks, delete tasks, and I would like the overall color theme to be purple."
},
]
dashscope
modelscope_studio~=1.0.0b
# app.py
import os
from http.client import HTTPMessage
os.system('pip install dashscope')
import gradio as gr
from http import HTTPStatus
import dashscope
from dashscope import Generation
from dashscope.api_entities.dashscope_response import Role
from typing import List, Optional, Tuple, Dict
from urllib.error import HTTPError
default_system = 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
YOUR_API_TOKEN = os.getenv('YOUR_API_TOKEN')
dashscope.api_key = YOUR_API_TOKEN
History = List[Tuple[str, str]]
Messages = List[Dict[str, str]]
def clear_session() -> History:
return '', []
def modify_system_session(system: str) -> str:
if system is None or len(system) == 0:
system = default_system
return system, system, []
def history_to_messages(history: History, system: str) -> Messages:
messages = [{'role': Role.SYSTEM, 'content': system}]
for h in history:
messages.append({'role': Role.USER, 'content': h[0]})
messages.append({'role': Role.ASSISTANT, 'content': h[1]})
return messages
def messages_to_history(messages: Messages) -> Tuple[str, History]:
assert messages[0]['role'] == Role.SYSTEM
system = messages[0]['content']
history = []
for q, r in zip(messages[1::2], messages[2::2]):
history.append([q['content'], r['content']])
return system, history
def model_chat(query: Optional[str], history: Optional[History], system: str,
temperature: float, top_p: float, max_length: int) -> Tuple[str, str, History]:
if query is None:
query = ''
if history is None:
history = []
messages = history_to_messages(history, system)
messages.append({'role': Role.USER, 'content': query})
gen = Generation.call(
model="qwen2.5-coder-32b-instruct",
messages=messages,
result_format='message',
stream=True,
temperature=temperature,
top_p=top_p,
max_length=max_length
)
for response in gen:
if response.status_code == HTTPStatus.OK:
role = response.output.choices[0].message.role
response = response.output.choices[0].message.content
system, history = messages_to_history(messages + [{'role': role, 'content': response}])
yield '', history, system
else:
raise HTTPError( code=404, msg='Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message), hdrs=HTTPMessage(), url='http://example.com', fp=None)
def choose_radio(radio, system):
mark_ = gr.Markdown(value=f"<center><font size=8>Qwen2.5-Coder-{radio}-instruct👾</center>")
chatbot = gr.Chatbot(label=f'Qwen2.5-Coder-{radio.lower()}-instruct')
if system is None or len(system) == 0:
system = default_system
return mark_, chatbot, system, system, ""
def update_other_radios(value, other_radio1, other_radio2):
if value == "":
if other_radio1 != "":
selected = other_radio1
else:
selected = other_radio2
return selected, other_radio1, other_radio2
return value, "", ""
def main():
# Create the two tabs
with gr.Blocks() as demo:
with gr.Row():
options_coder = ["0.5B", "1.5B", "3B", "7B", "14B", "32B",]
with gr.Row():
radio = gr.Radio(choices=options_coder, label="Qwen2.5-Coder:", value="32B")
with gr.Row():
with gr.Accordion():
mark_ = gr.Markdown("""<center><font size=8>Qwen2.5-Coder-32B-Instruct Bot👾</center>""")
with gr.Row():
with gr.Column(scale=3):
system_input = gr.Textbox(value=default_system, lines=1, label='System')
with gr.Column(scale=1):
modify_system = gr.Button("🛠️ Set system prompt and clear history", scale=2)
system_state = gr.Textbox(value=default_system, visible=False)
chatbot = gr.Chatbot(label='Qwen2.5-Coder-32B-Instruct')
textbox = gr.Textbox(lines=1, label='Input')
with gr.Row():
clear_history = gr.Button("🧹 Clear History")
submit = gr.Button("🚀 Send")
with gr.Accordion("Parameters", open=False):
temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="Temperature")
top_p = gr.Slider(minimum=0.6, maximum=1.0, value=0.9, step=0.05, label="Top P")
max_length = gr.Slider(minimum=512, maximum=8192, value=2048, step=128, label="Max Length")
textbox.submit(model_chat,
inputs=[textbox, chatbot, system_state, temperature, top_p, max_length],
outputs=[textbox, chatbot, system_input])
submit.click(model_chat,
inputs=[textbox, chatbot, system_state, temperature, top_p, max_length],
outputs=[textbox, chatbot, system_input],
concurrency_limit=100)
clear_history.click(fn=clear_session,
inputs=[],
outputs=[textbox, chatbot])
modify_system.click(fn=modify_system_session,
inputs=[system_input],
outputs=[system_state, system_input, chatbot])
radio.change(choose_radio,
inputs=[radio, system_input],
outputs=[mark_, chatbot, system_state, system_input, textbox])
demo.queue(api_open=False, default_concurrency_limit=40)
demo.launch(max_threads=5)
if __name__ == "__main__":
main()
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10
RUN source /opt/dtk/env.sh
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer
from threading import Thread
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct", device_map="auto").eval()
# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
prompt = "write a quick sort algorithm."
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(inputs=model_inputs.input_ids, streamer=streamer, max_new_tokens=2048)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
generated_text += new_text
print(new_text, end="")
print(generated_text)
# Use Qwen2.5-Coder-32B-Instruct with transformers
The simplest and most common way to use Qwen2.5-Coder-32B-Instruct is through the `transformers` library. In this document, we show how to chat with Qwen2.5-Coder-32B-Instruct in both streaming and non-streaming modes.
## Basic Usage
You can chat with Qwen2.5-Coder-32B-Instruct in just a few lines of code with `transformers`. Essentially, we build the tokenizer and the model with the `from_pretrained` method, and we use the `generate` method to chat with the help of the chat template provided by the tokenizer. Below is an example of how to chat with Qwen2.5-Coder-32B-Instruct:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto",
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = "write a quick sort algorithm."
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(
**model_inputs,
max_new_tokens=512
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```
The `apply_chat_template()` function is used to convert the messages into a format that the model can understand.
The `add_generation_prompt` argument adds a generation prompt, i.e. `<|im_start|>assistant\n`, to the input. Notably, we apply the ChatML template for chat models, following our previous practice.
The `max_new_tokens` argument sets the maximum length of the response, and the `tokenizer.batch_decode()` function decodes the generated tokens back into text. The `messages` list above shows how to format your dialog history and system prompt.
## Processing Long Texts
The current `config.json` is set for context length up to 32,768 tokens.
To handle extensive inputs exceeding 32,768 tokens, we utilize [YaRN](https://arxiv.org/abs/2309.00071), a technique for enhancing model length extrapolation, ensuring optimal performance on lengthy texts.
For supported frameworks, you could add the following to `config.json` to enable YaRN:
```json
{
...,
"rope_scaling": {
"factor": 4.0,
"original_max_position_embeddings": 32768,
"type": "yarn"
}
}
```
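The same setting can also be applied in code without editing `config.json` on disk. The snippet below is a minimal sketch under that assumption: it loads the config, enables YaRN scaling, and then loads the model with the modified config:
```python
from transformers import AutoConfig, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"

# Load the config, enable YaRN rope scaling, then load the model with the modified config.
config = AutoConfig.from_pretrained(model_name)
config.rope_scaling = {
    "factor": 4.0,
    "original_max_position_embeddings": 32768,
    "type": "yarn",
}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype="auto",
    device_map="auto",
)
```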
## Streaming Mode
With the help of `TextStreamer`, you can switch chatting with Qwen2.5-Coder-32B-Instruct to streaming mode. Below is an example of how to use it:
```python
# Repeat the code above before model.generate()
# Starting here, we add streamer for text generation.
from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# This will print the output in the streaming mode.
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=2048,
streamer=streamer,
)
```
Besides `TextStreamer`, we can also use `TextIteratorStreamer`, which stores print-ready text in a queue so that a downstream application can consume it as an iterator:
```python
# Repeat the code above before model.generate()
# Starting here, we add streamer for text generation.
from transformers import TextIteratorStreamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
from threading import Thread
generation_kwargs = dict(inputs=model_inputs.input_ids, streamer=streamer, max_new_tokens=2048)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
generated_text += new_text
print(new_text, end="")
```
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct", device_map="auto").eval()
# tokenize the input into tokens
# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
prompt = "write a quick sort algorithm."
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
# Directly use generate() and tokenizer.decode() to get the output.
# Use `max_new_tokens` to control the maximum output length.
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=2048 # can increase the output length
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
from transformers import AutoTokenizer, AutoModelForCausalLM
# load model
device = "cuda" # the device to load the model onto
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
input_text = """<|fim_prefix|>def quicksort(arr):
if len(arr) <= 1:
return arr
pivot = arr[len(arr) // 2]
<|fim_suffix|>
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]
return quicksort(left) + middle + quicksort(right)<|fim_middle|>"""
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
eos_token_ids = [151664, 151662, 151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
# Use `max_new_tokens` to control the maximum output length.
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: {input_text}\n\nGenerated text: {output_text}")
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
# tokenize the input into tokens
# set fim format into the corresponding file you need to infilling
input_text = """<|repo_name|>library-system
<|file_sep|>library.py
class Book:
def __init__(self, title, author, isbn, copies):
self.title = title
self.author = author
self.isbn = isbn
self.copies = copies
def __str__(self):
return f"Title: {self.title}, Author: {self.author}, ISBN: {self.isbn}, Copies: {self.copies}"
class Library:
def __init__(self):
self.books = []
def add_book(self, title, author, isbn, copies):
book = Book(title, author, isbn, copies)
self.books.append(book)
def find_book(self, isbn):
for book in self.books:
if book.isbn == isbn:
return book
return None
def list_books(self):
return self.books
<|file_sep|>student.py
class Student:
def __init__(self, name, id):
self.name = name
self.id = id
self.borrowed_books = []
def borrow_book(self, book, library):
if book and book.copies > 0:
self.borrowed_books.append(book)
book.copies -= 1
return True
return False
def return_book(self, book, library):
if book in self.borrowed_books:
self.borrowed_books.remove(book)
book.copies += 1
return True
return False
<|file_sep|>main.py
<|fim_prefix|>from library import Library
from student import Student
def main():
# Set up the library with some books
library = Library()
library.add_book("The Great Gatsby", "F. Scott Fitzgerald", "1234567890", 3)
library.add_book("To Kill a Mockingbird", "Harper Lee", "1234567891", 2)
# Set up a student
student = Student("Alice", "S1")
# Student borrows a book<|fim_suffix|>
if student.borrow_book(book, library):
print(f"{student.name} borrowed {book.title}")
else:
print(f"{student.name} could not borrow {book.title}")
# Student returns a book
if student.return_book(book, library):
print(f"{student.name} returned {book.title}")
else:
print(f"{student.name} could not return {book.title}")
# List all books in the library
print("All books in the library:")
for book in library.list_books():
print(book)
if __name__ == "__main__":
main()<|fim_middle|>
"""
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
eos_token_ids = [151659, 151661, 151662, 151663, 151664, 151643, 151645]
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=1024, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: \n{input_text}\n\nGenerated text: \n{output_text.split('<|file_sep|>')[0]}")
# The expected output is as follows:
"""
Generated text:
book = library.find_book("1234567890")
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
# tokenize the input into tokens
input_text = """<|repo_name|>library-system
<|file_sep|>library.py
class Book:
def __init__(self, title, author, isbn, copies):
self.title = title
self.author = author
self.isbn = isbn
self.copies = copies
def __str__(self):
return f"Title: {self.title}, Author: {self.author}, ISBN: {self.isbn}, Copies: {self.copies}"
class Library:
def __init__(self):
self.books = []
def add_book(self, title, author, isbn, copies):
book = Book(title, author, isbn, copies)
self.books.append(book)
def find_book(self, isbn):
for book in self.books:
if book.isbn == isbn:
return book
return None
def list_books(self):
return self.books
<|file_sep|>student.py
class Student:
def __init__(self, name, id):
self.name = name
self.id = id
self.borrowed_books = []
def borrow_book(self, book, library):
if book and book.copies > 0:
self.borrowed_books.append(book)
book.copies -= 1
return True
return False
def return_book(self, book, library):
if book in self.borrowed_books:
self.borrowed_books.remove(book)
book.copies += 1
return True
return False
<|file_sep|>main.py
from library import Library
from student import Student
def main():
# Set up the library with some books
library = Library()
library.add_book("The Great Gatsby", "F. Scott Fitzgerald", "1234567890", 3)
library.add_book("To Kill a Mockingbird", "Harper Lee", "1234567891", 2)
# Set up a student
student = Student("Alice", "S1")
# Student borrows a book
"""
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
eos_token_ids = [151664, 151662, 151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=1024, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: \n{input_text}\n\nGenerated text: \n{output_text.split('<|file_sep|>')[0]}")
# The expected output is as follows:
"""
Generated text:
book = library.find_book("1234567890")
if student.borrow_book(book, library):
print(f"{student.name} borrowed {book.title}")
else:
print(f"{student.name} could not borrow {book.title}")
# Student returns a book
if student.return_book(book, library):
print(f"{student.name} returned {book.title}")
else:
print(f"{student.name} could not return {book.title}")
# List all books in the library
print("All books in the library:")
for book in library.list_books():
print(book)
if __name__ == "__main__":
main()
"""
# Use Qwen2.5-Coder-32B with transformers
One of the simplest but most fundamental ways to try Qwen2.5-Coder-32B is through the `transformers` library. In this document, we show how to use Qwen2.5-Coder-32B in several common code generation scenarios.
## Basic Usage
The model completes code snippets according to the given prompts without any additional formatting, which is usually referred to as `code completion` in code generation tasks.
Essentially, we build the tokenizer and the model with the `from_pretrained` method, and we use the `generate` method to perform code completion. Below is an example of how to use Qwen2.5-Coder-32B for code completion:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
# tokenize the input into tokens
input_text = "#write a quick sort algorithm"
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
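# Stop generation when any of Qwen2.5-Coder's special-token ids (end-of-text and the FIM/repository markers) is produced.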
eos_token_ids = [151664, 151662, 151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: {input_text}\n\nGenerated text: {output_text}")
```
The `max_new_tokens` argument is used to set the maximum length of the response.
The `input_text` can be any text that you would like the model to continue.
## Code Insertion (Fill in the middle)
The code insertion task, also referred to as the "fill-in-the-middle" challenge, requires the insertion of code segments in a manner that bridges the gaps within a given code context.
For an approach aligned with best practices, we recommend adhering to the formatting guidelines outlined in the paper "Efficient Training of Language Models to Fill in the Middle" [[arxiv](https://arxiv.org/abs/2207.14255)]. This involves the use of three specialized tokens, `<|fim_prefix|>`, `<|fim_suffix|>`, and `<|fim_middle|>`, to denote the respective segments of the code structure.
The prompt should be structured as follows:
```python
prompt = '<|fim_prefix|>' + prefix_code + '<|fim_suffix|>' + suffix_code + '<|fim_middle|>'
```
Following the approach mentioned, an example would be structured in this manner:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
# load model
device = "cuda" # the device to load the model onto
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
input_text = """<|fim_prefix|>def quicksort(arr):
if len(arr) <= 1:
return arr
pivot = arr[len(arr) // 2]
<|fim_suffix|>
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]
return quicksort(left) + middle + quicksort(right)<|fim_middle|>"""
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
eos_token_ids = [151664, 151662, 151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: {input_text}\n\nGenerated text: {output_text}")
```
## Repository Level Code Completion
The repository level code completion task involves feeding the model the content of multiple files from the same repository. This enables the model to understand the interrelationships between different calls within these files, thereby facilitating the completion of code content.
We recommend using the two special tokens `<|repo_name|>` and `<|file_sep|>` to indicate the repository structure.
For example, assuming the repository name is stored in `repo_name`, and it contains files with their respective paths and contents listed as [(`file_path1`, `file_content1`), (`file_path2`, `file_content2`)], the format of the final input prompt would be as follows:
```python
input_text = f'''<|repo_name|>{repo_name}
<|file_sep|>{file_path1}
{file_content1}
<|file_sep|>{file_path2}
{file_content2}'''
```
Below is a complete example of a repository level code completion task:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
# tokenize the input into tokens
input_text = """<|repo_name|>library-system
<|file_sep|>library.py
class Book:
def __init__(self, title, author, isbn, copies):
self.title = title
self.author = author
self.isbn = isbn
self.copies = copies
def __str__(self):
return f"Title: {self.title}, Author: {self.author}, ISBN: {self.isbn}, Copies: {self.copies}"
class Library:
def __init__(self):
self.books = []
def add_book(self, title, author, isbn, copies):
book = Book(title, author, isbn, copies)
self.books.append(book)
def find_book(self, isbn):
for book in self.books:
if book.isbn == isbn:
return book
return None
def list_books(self):
return self.books
<|file_sep|>student.py
class Student:
def __init__(self, name, id):
self.name = name
self.id = id
self.borrowed_books = []
def borrow_book(self, book, library):
if book and book.copies > 0:
self.borrowed_books.append(book)
book.copies -= 1
return True
return False
def return_book(self, book, library):
if book in self.borrowed_books:
self.borrowed_books.remove(book)
book.copies += 1
return True
return False
<|file_sep|>main.py
from library import Library
from student import Student
def main():
# Set up the library with some books
library = Library()
library.add_book("The Great Gatsby", "F. Scott Fitzgerald", "1234567890", 3)
library.add_book("To Kill a Mockingbird", "Harper Lee", "1234567891", 2)
# Set up a student
student = Student("Alice", "S1")
# Student borrows a book
"""
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
eos_token_ids = [151664, 151662, 151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=1024, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: \n{input_text}\n\nGenerated text: \n{output_text.split('<|file_sep|>')[0]}")
```
The expected output is as follows:
```
Generated text:
book = library.find_book("1234567890")
if student.borrow_book(book, library):
print(f"{student.name} borrowed {book.title}")
else:
print(f"{student.name} could not borrow {book.title}")
# Student returns a book
if student.return_book(book, library):
print(f"{student.name} returned {book.title}")
else:
print(f"{student.name} could not return {book.title}")
# List all books in the library
print("All books in the library:")
for book in library.list_books():
print(book)
if __name__ == "__main__":
main()
```
## Repository Level Code Infilling
Repository-level code infilling essentially concatenates the repository-level format with the FIM format, as shown below:
```python
input_text = f'''<|repo_name|>{repo_name}
<|file_sep|>{file_path1}
{file_content1}
<|file_sep|>{file_path2}
{file_content2}
<|file_sep|>{file_path3}
<|fim_prefix|>{prefix_code}<|fim_suffix|>{suffix_code}<|fim_middle|>'''
```
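Reusing the `build_repo_prompt` helper sketched above, the file to be infilled is appended last, with its prefix and suffix wrapped in the FIM tokens. Again, this is an illustrative sketch rather than an official API:
```python
# Minimal sketch: append the target file (in FIM format) after the context files.
def build_repo_fim_prompt(repo_name, context_files, target_path, prefix_code, suffix_code):
    repo_part = build_repo_prompt(repo_name, context_files)  # helper from the previous sketch
    fim_part = (
        f"<|file_sep|>{target_path}\n"
        f"<|fim_prefix|>{prefix_code}<|fim_suffix|>{suffix_code}<|fim_middle|>"
    )
    return f"{repo_part}\n{fim_part}"
```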
Below is an example of a repository level code infilling task:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" # the device to load the model onto
# Now you do not need to add "trust_remote_code=True"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B", device_map="auto").eval()
# tokenize the input into tokens
# apply the FIM format to the file you need to infill
input_text = """<|repo_name|>library-system
<|file_sep|>library.py
class Book:
def __init__(self, title, author, isbn, copies):
self.title = title
self.author = author
self.isbn = isbn
self.copies = copies
def __str__(self):
return f"Title: {self.title}, Author: {self.author}, ISBN: {self.isbn}, Copies: {self.copies}"
class Library:
def __init__(self):
self.books = []
def add_book(self, title, author, isbn, copies):
book = Book(title, author, isbn, copies)
self.books.append(book)
def find_book(self, isbn):
for book in self.books:
if book.isbn == isbn:
return book
return None
def list_books(self):
return self.books
<|file_sep|>student.py
class Student:
def __init__(self, name, id):
self.name = name
self.id = id
self.borrowed_books = []
def borrow_book(self, book, library):
if book and book.copies > 0:
self.borrowed_books.append(book)
book.copies -= 1
return True
return False
def return_book(self, book, library):
if book in self.borrowed_books:
self.borrowed_books.remove(book)
book.copies += 1
return True
return False
<|file_sep|>main.py
<|fim_prefix|>from library import Library
from student import Student
def main():
# Set up the library with some books
library = Library()
library.add_book("The Great Gatsby", "F. Scott Fitzgerald", "1234567890", 3)
library.add_book("To Kill a Mockingbird", "Harper Lee", "1234567891", 2)
# Set up a student
student = Student("Alice", "S1")
# Student borrows a book<|fim_suffix|>
if student.borrow_book(book, library):
print(f"{student.name} borrowed {book.title}")
else:
print(f"{student.name} could not borrow {book.title}")
# Student returns a book
if student.return_book(book, library):
print(f"{student.name} returned {book.title}")
else:
print(f"{student.name} could not return {book.title}")
# List all books in the library
print("All books in the library:")
for book in library.list_books():
print(book)
if __name__ == "__main__":
main()<|fim_middle|>
"""
model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
# Use `max_new_tokens` to control the maximum output length.
eos_token_ids = [151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=1024, do_sample=False, eos_token_id=eos_token_ids)[0]
# The generated_ids include prompt_ids, so we only need to decode the tokens after prompt_ids.
output_text = tokenizer.decode(generated_ids[len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Prompt: \n{input_text}\n\nGenerated text: \n{output_text.split('<|file_sep|>')[0]}")
# The expected output is as follows:
"""
Generated text:
book = library.find_book("1234567890")
"""
```
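In both examples above, `eos_token_ids` is hard-coded. A more readable alternative is to derive the ids from the tokenizer itself; the sketch below assumes the listed special tokens are present in the Qwen2.5-Coder vocabulary:
```python
# Derive stop-token ids from the tokenizer instead of hard-coding them.
# Assumption: these special tokens exist in the Qwen2.5-Coder vocabulary.
stop_tokens = [
    "<|endoftext|>", "<|im_end|>", "<|fim_prefix|>", "<|fim_middle|>",
    "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>",
]
eos_token_ids = tokenizer.convert_tokens_to_ids(stop_tokens)
```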
# Use Qwen2.5-Coder-32B with vLLM
As a member of the Qwen2.5 family, Qwen2.5-Coder-32B is supported by vLLM. A detailed tutorial can be found in the [Qwen tutorial](https://qwen.readthedocs.io/en/latest/deployment/vllm.html).
Here, we only give a simple example of offline batched inference with vLLM.
## Offline Batched Inference
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B")
# Pass the default decoding hyperparameters of Qwen2.5-Coder-32B
# max_tokens controls the maximum generation length.
eos_token_ids = [151659, 151660, 151661, 151662, 151663, 151664, 151645, 151643]
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=1024, stop_token_ids=eos_token_ids)
# Input the model name or path. Can be GPTQ or AWQ models.
llm = LLM(model="Qwen/Qwen2.5-Coder-32B")
# Prepare your prompts
prompt = "#write a quick sort algorithm.\ndef quick_sort("
# generate outputs
outputs = llm.generate([prompt], sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
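The FIM and repository-level prompts shown earlier also work with vLLM; only the prompt string changes. Below is a minimal sketch in which the prefix and suffix content are illustrative:
```python
# Reuse the FIM special tokens with vLLM; only the prompt changes.
fim_prompt = (
    "<|fim_prefix|>def quick_sort(arr):\n"
    "    if len(arr) <= 1:\n"
    "        return arr\n"
    "<|fim_suffix|>\n"
    "    return quick_sort(left) + [pivot] + quick_sort(right)<|fim_middle|>"
)
outputs = llm.generate([fim_prompt], sampling_params)
print(outputs[0].outputs[0].text)
```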
## Multi-GPU Distributed Serving
To scale up serving throughput, distributed serving leverages multiple GPU devices. It also helps when ultra-long input sequences would otherwise exhaust the memory of a single GPU. Here, we demonstrate how to run Qwen2.5-Coder-32B with tensor parallelism simply by passing the `tensor_parallel_size` argument:
```python
llm = LLM(model="Qwen/Qwen2.5-Coder-32B", tensor_parallel_size=8)
```
## Streaming Mode
With the help of `TextStreamer`, you can switch generation with Qwen2.5-Coder to streaming mode. Below is an example of how to use it:
```python
# Repeat the code above before model.generate()
# Starting here, we add streamer for text generation.
from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# This will print the output in the streaming mode.
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=2048,
streamer=streamer,
)
```
Besides `TextStreamer`, we can also use `TextIteratorStreamer`, which stores print-ready text in a queue so that a downstream application can consume it as an iterator:
```python
# Repeat the code above before model.generate()
# Starting here, we add streamer for text generation.
from transformers import TextIteratorStreamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
from threading import Thread
generation_kwargs = dict(inputs=model_inputs.input_ids, streamer=streamer, max_new_tokens=2048)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
generated_text += new_text
print(new_text, end="")
```