added baichuan2

7f8094a3 · zhaoying1 · 7f8094a3 · 7f8094a3 · 7f8094a3 · 7f8094a3
Commit 7f8094a3 authored Nov 09, 2023 by zhaoying1
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
+# Base image: PyTorch 1.13.1 / DTK 23.04 / Python 3.7 on CentOS 7.6 (per the image tag).
+# NOTE(review): the tag ends in "-latest" and may be re-pushed; pin by digest once a known-good image is chosen.
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py37-latest
+# `source` is a bash builtin, so make bash the shell used by the RUN steps below.
+SHELL ["/bin/bash", "-c"]
+ENV LANG=C.UTF-8
+# Use Asia/Shanghai as the local time zone inside the container.
+RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
+# Copy only the dependency manifest so the install layer is rebuilt only when it changes.
+COPY requirements.txt requirements.txt
+# Every RUN starts a fresh shell, so the DTK environment must be sourced in the same
+# instruction as the commands that rely on it; a stand-alone `RUN source ...` has no
+# effect on later steps. accelerate is installed without dependencies, as before.
+RUN source /opt/dtk-23.04/env.sh \
+    && pip install --no-cache-dir -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ \
+    && pip install --no-cache-dir accelerate --no-dependencies -i https://mirrors.aliyun.com/pypi/simple/
\ No newline at end of file
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
+# Baichuan 2
+## 论文
+`Baichuan 2: Open Large-scale Language Models`
+https://arxiv.org/abs/2309.10305
+## 模型结构
+Baichuan 2 是百川智能推出的新一代开源大语言模型，采用 2.6 万亿Tokens 的高质量语料训练。
+模型具体参数：
+| 模型名称 | 隐含层维度 | 层数 | 头数 | 词表大小 |  位置编码 | 最大长度 |
+| -------- | -------- | -------- | -------- |   -------- | -------- | -------- |
+| Baichuan 2-7B | 4,096 | 32 | 32 | 125,696 |  RoPE | 4096 |
+| Baichuan 2-13B | 5,120 | 40 | 	40 | 125,696 |   ALiBi | 4096 |
+<div align="center">
+<img src="./media/transformer.jpg" width="400" height="300">
+</div>
+## 算法原理
+Baichuan整体模型基于标准的Transformer结构，采用了和LLaMA一样的模型设计。其中，Baichuan-7B在结构上采用Rotary Embedding位置编码方案、SwiGLU激活函数、基于RMSNorm的Pre-Normalization。Baichuan-13B使用了ALiBi线性偏置技术，相对于Rotary Embedding计算量更小，对推理性能有显著提升。
+<div align="center">
+<img src="./media/transformer.png" width="450" height="300">
+</div>
+## 环境配置
+### Docker(方式一)
+推荐使用docker方式运行，提供拉取的docker镜像：
+```
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py37-latest
+```
+安装docker中没有的依赖:
+```
+pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+pip install accelerate --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+pip install datasets peft tokenizers sentencepiece numpy -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+```
+### Dockerfile(方式二)
+```
+docker build -t baichuan2:latest .
+docker run -dit --network=host --name=baichuan2 --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G  --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 baichuan2:latest
+docker exec -it baichuan2 /bin/bash
+```
+### Conda(方式三)
+1. 创建conda虚拟环境：
+```
+conda create -n baichuan2 python=3.7
+```
+2. 关于本项目DCU显卡所需的工具包、深度学习库等均可从[光合](https://developer.hpccube.com/tool/)开发者社区下载安装。
+- [DTK 23.04](https://cancon.hpccube.com:65024/1/main/DTK-23.04.1)
+- [Pytorch 1.13.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04)
+- [Deepspeed 0.9.2](https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04)
+    Tips：以上dtk驱动、python、deepspeed等工具版本需要严格一一对应。
+3. 其它依赖库参照requirements.txt安装：
+```
+pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com 
+pip install accelerate --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+```
+## 数据集
+输入数据为放置在项目[fine-tune/data](./fine-tune/data)目录下的 json 文件，`fine-tune/data/belle_chat_ramdon_10k.json`，该样例数据是从 [multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) 采样出 1 万条，并且做了格式转换。主要是展示多轮数据怎么训练，不保证效果。json 文件示例格式如下：
+```
+[
+ {
+  "id": "77771",
+  "conversations": [
+   {
+    "from": "human",
+    "value": "写一篇关于气候变化对海洋生态的影响的文章。\n"
+   },
+   {
+    "from": "gpt",
+    "value": "好的，以下是你要求的文章：\n气候变化对海洋生态的影响\n气候变化对海洋生态产生了广泛的影响。全球变暖导致海洋温度上升，这可能会对许多水生生物的生存、繁殖和迁移造成巨大的影响。另外，海洋酸化也是一个问题，因为大气中二氧化碳的增加导致海水中的pH值下降，使得许多生物的外骨骼变得脆弱，从而影响它们的生存和繁殖。\n此外，气候变化还会引起海平面上升，这不仅对海岸线和沿海居民产生影响，还会对许多海洋生物的栖息地造成影响。一些岛屿上的陆地生物面临几乎完全消失的风险，而另一些生物范围则会扩大到更广泛的领域，这可能会导致新的竞争和生态平衡的改变。\n总之，气候变化并不只是一个地球气温的问题，它对海洋生态系统产生了广泛和深远的影响。\n"
+   },
+   {
+    "from": "human",
+    "value": "从刚刚的文章中提取出气候变化对海洋生物繁殖的影响。\n"
+   },
+   {
+    "from": "gpt",
+    "value": "文章已经提到过气候变化会对海洋生物的生存、繁殖和迁移造成影响。例如，海洋酸化会使得许多生物的外骨骼变得脆弱，进而影响它们的生存和繁殖。此外，海洋温度上升也可能会对很多生物的生存和繁殖产生负面影响。"
+   }
+  ]
+ }
+    ....
+]
+```
+## 模型下载
+Hugging Face模型下载地址：
+[Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)
+[Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)
+[Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)
+[Baichuan2-13B-Base](https://huggingface.co/baichuan-inc/Baichuan2-13B-Base)
+## 训练
+### 全参数微调训练
+1. 单机训练
+```
+cd fine-tune
+bash ft_train.sh
+```
+2. 多机训练
+```
+cd fine-tune/multi-node
+``` 
+进入节点1，根据环境修改hostfile，保证两节点文件路径一致，配置相同，按需修改run-13b-sft.sh中--mca btl_tcp_if_include enp97s0f1，enp97s0f1改为ip a命令后对应节点ip的网卡名，numa可以根据当前节点拓扑更改绑定，微调命令：
+``` 
+bash run_ft.sh
+``` 
+### LoRA微调训练
+1. 单机训练
+```
+cd fine-tune
+bash lora_train.sh
+```
+2. 多机训练
+```
+cd fine-tune/multi-node
+``` 
+进入节点1，根据环境修改hostfile，保证两节点文件路径一致，配置相同，按需修改run-13b-sft.sh中--mca btl_tcp_if_include enp97s0f1，enp97s0f1改为ip a命令后对应节点ip的网卡名，numa可以根据当前节点拓扑更改绑定，微调命令：
+``` 
+bash run_lora.sh
+``` 
+## 推理
+### 命令行测试
+```bash
+python cli_demo.py
+```
+请根据情况修改其中的模型加载路径。
+## Result
+- 以下为我们基于baichuan2-7b-base模型进行全参数指令微调实验后的推理效果：
+<div align="center">
+<img src="./media/baichuan2-test.png" width="500" height="230">
+</div>
+## 精度
+- 以下为我们基于baichuan2-7b-base模型进行全参数指令微调实验的loss收敛情况：
+<div align="center">
+<img src="./media/baichuan2_7bbase_ft_96c_bs1_acum1_fp16_lr2e-5.jpg" width="300" height="250">
+</div>
+## 应用场景
+### 算法类别
+`对话问答`
+### 热点应用行业
+`医疗,教育,科研,金融`
+## 源码仓库及问题反馈
+- https://developer.hpccube.com/codes/modelzoo/baichuan2_pytorch
+## 参考
+- [https://github.com/baichuan-inc/Baichuan2/tree/main](https://github.com/baichuan-inc/Baichuan2/tree/main)
\ No newline at end of file
--- a/cli_demo.py
+++ b/cli_demo.py
+import os
+import torch
+import platform
+import subprocess
+from colorama import Fore, Style
+from tempfile import NamedTemporaryFile
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation.utils import GenerationConfig
+def init_model():
+    print("init model ...")
+    model = AutoModelForCausalLM.from_pretrained(
+        "/public/home/zhaoying1/work/Baichuan2-main/fine-tune/slurm_script/output/checkpoint-420",
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    model.generation_config = GenerationConfig.from_pretrained(
+        "/public/home/zhaoying1/work/Baichuan2-main/fine-tune/slurm_script/output/checkpoint-420"
+    )
+    print(model.generation_config)
+    tokenizer = AutoTokenizer.from_pretrained(
+        "/public/home/zhaoying1/work/Baichuan2-main/fine-tune/slurm_script/output/checkpoint-420",
+        use_fast=False,
+        trust_remote_code=True
+    )
+    return model, tokenizer
+def clear_screen():
+    if platform.system() == "Windows":
+        os.system("cls")
+    else:
+        os.system("clear")
+    print(Fore.YELLOW + Style.BRIGHT + "欢迎使用百川大模型，输入进行对话，vim 多行输入，clear 清空历史，CTRL+C 中断生成，stream 开关流式生成，exit 结束。")
+    return []
+def vim_input():
+    with NamedTemporaryFile() as tempfile:
+        tempfile.close()
+        subprocess.call(['vim', '+star', tempfile.name])
+        text = open(tempfile.name).read()
+    return text
+def main(stream=True):
+    model, tokenizer = init_model()
+    messages = clear_screen()
+    while True:
+        prompt = input(Fore.GREEN + Style.BRIGHT + "\n用户：" + Style.NORMAL)
+        if prompt.strip() == "exit":
+            break
+        if prompt.strip() == "clear":
+            messages = clear_screen()
+            continue
+        if prompt.strip() == 'vim':
+            prompt = vim_input()
+            print(prompt)
+        print(Fore.CYAN + Style.BRIGHT + "\nBaichuan 2：" + Style.NORMAL, end='')
+        if prompt.strip() == "stream":
+            stream = not stream
+            print(Fore.YELLOW + "({}流式生成)\n".format("开启" if stream else "关闭"), end='')
+            continue
+        messages.append({"role": "user", "content": prompt})
+        if stream:
+            position = 0
+            try:
+                for response in model.chat(tokenizer, messages, stream=True):
+                    print(response[position:], end='', flush=True)
+                    position = len(response)
+                    if torch.backends.mps.is_available():
+                        torch.mps.empty_cache()
+            except KeyboardInterrupt:
+                pass
+            print()
+        else:
+            response = model.chat(tokenizer, messages)
+            print(response)
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+        messages.append({"role": "assistant", "content": response})
+    print(Style.RESET_ALL)
+if __name__ == "__main__":
+    main()
--- a/fine-tune/data/belle_chat_ramdon_10k.json
+++ b/fine-tune/data/belle_chat_ramdon_10k.json
--- a/fine-tune/ds_config.json
+++ b/fine-tune/ds_config.json
+{
+    "train_micro_batch_size_per_gpu": "auto",
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "initial_scale_power": 16, 
+      "loss_scale_window": 1000,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    }, 
+    "zero_force_ds_cpu_optimizer": false,
+    "zero_optimization": {
+      "stage": 3,
+      "offload_param": {
+        "device": "cpu",
+        "pin_memory": true
+    },
+    "offload_optimizer": {
+        "device": "cpu",
+        "pin_memory": true
+    },
+    "stage3_gather_16bit_weights_on_model_save": true,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+    }
+  }
--- a/fine-tune/ds_config_zero2.json
+++ b/fine-tune/ds_config_zero2.json
+{
+    "train_micro_batch_size_per_gpu": "auto",
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "initial_scale_power": 16, 
+      "loss_scale_window": 1000,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    }, 
+    "zero_force_ds_cpu_optimizer": false,
+    "zero_optimization": {
+    "stage": 2,
+    "stage3_gather_16bit_weights_on_model_save": true,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+    }
+  }
--- a/fine-tune/fine-tune.py
+++ b/fine-tune/fine-tune.py
+import os
+import math
+import pathlib
+from typing import Optional, Dict
+from dataclasses import dataclass, field
+import json
+import torch
+from torch.utils.data import Dataset
+import transformers
+from transformers.training_args import TrainingArguments
+@dataclass
+class ModelArguments:
+    model_name_or_path: Optional[str] = field(default="baichuan-inc/Baichuan2-7B-Base")
+@dataclass
+class DataArguments:
+    data_path: str = field(
+        default=None, metadata={"help": "Path to the training data."}
+    )
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+    cache_dir: Optional[str] = field(default=None)
+    optim: str = field(default="adamw_torch")
+    model_max_length: int = field(
+        default=512,
+        metadata={
+            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+        },
+    )
+    use_lora: bool = field(default=False)
+class SupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+    def __init__(
+        self,
+        data_path,
+        tokenizer,
+        model_max_length,
+        user_tokens=[195],
+        assistant_tokens=[196],
+    ):
+        super(SupervisedDataset, self).__init__()
+        self.data = json.load(open(data_path))
+        self.tokenizer = tokenizer
+        self.model_max_length = model_max_length
+        self.user_tokens = user_tokens
+        self.assistant_tokens = assistant_tokens
+        self.ignore_index = -100
+        item = self.preprocessing(self.data[0])
+        print("input:", self.tokenizer.decode(item["input_ids"]))
+        labels = []
+        for id_ in item["labels"]:
+            if id_ == -100:
+                continue
+            labels.append(id_)
+        print("label:", self.tokenizer.decode(labels))
+    def __len__(self):
+        return len(self.data)
+    def preprocessing(self, example):
+        input_ids = []
+        labels = []
+        for message in example["conversations"]:
+            from_ = message["from"]
+            value = message["value"]
+            value_ids = self.tokenizer.encode(value)
+            if from_ == "human":
+                input_ids += self.user_tokens + value_ids
+                labels += [self.tokenizer.eos_token_id] + [self.ignore_index] * len(
+                    value_ids
+                )
+                # print("human_input_ids",input_ids)
+                # print("human_input_ids",labels)
+            else:
+                input_ids += self.assistant_tokens + value_ids
+                labels += [self.ignore_index] + value_ids
+                # print("gpt_input_ids",input_ids)
+                # print("gpt_labels",labels)
+        input_ids.append(self.tokenizer.eos_token_id)
+        labels.append(self.tokenizer.eos_token_id)
+        # print("input_ids!!!!",input_ids)
+        # print("labels!!!",labels)
+        input_ids = input_ids[: self.model_max_length]
+        labels = labels[: self.model_max_length]
+        input_ids += [self.tokenizer.pad_token_id] * (
+            self.model_max_length - len(input_ids)
+        )
+        labels += [self.ignore_index] * (self.model_max_length - len(labels))
+        input_ids = torch.LongTensor(input_ids)
+        labels = torch.LongTensor(labels)
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+        return {
+            "input_ids": input_ids,
+            "labels": labels,
+            "attention_mask": attention_mask,
+        }
+    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
+        return self.preprocessing(self.data[idx])
+def train():
+    parser = transformers.HfArgumentParser(
+        (ModelArguments, DataArguments, TrainingArguments)
+    )
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_args.model_name_or_path,
+        trust_remote_code=True,
+        cache_dir=training_args.cache_dir,
+    )
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        use_fast=False,
+        trust_remote_code=True,
+        model_max_length=training_args.model_max_length,
+        cache_dir=training_args.cache_dir,
+    )
+    if training_args.use_lora:
+        from peft import LoraConfig, TaskType, get_peft_model
+        peft_config = LoraConfig(
+            task_type=TaskType.CAUSAL_LM,
+            target_modules=["W_pack"],
+            inference_mode=False,
+            r=1,
+            lora_alpha=32,
+            lora_dropout=0.1,
+        )
+        model.enable_input_require_grads()
+        model = get_peft_model(model, peft_config)
+        model.print_trainable_parameters()
+    dataset = SupervisedDataset(
+        data_args.data_path, tokenizer, training_args.model_max_length
+    )
+    trainer = transformers.Trainer(
+        model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer
+    )
+    trainer.train()
+    trainer.save_state()
+    trainer.save_model(output_dir=training_args.output_dir)
+if __name__ == "__main__":
+    train()
--- a/fine-tune/ft_train.sh
+++ b/fine-tune/ft_train.sh
+#!/bin/bash
+# Full-parameter fine-tuning launched through DeepSpeed with ds_config.json,
+# on the four devices listed in HIP_VISIBLE_DEVICES.
+# Path to a DeepSpeed hostfile; empty here.
+hostfile=""
+# "$hostfile" is quoted so a path containing spaces stays a single argument.
+HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --hostfile="$hostfile" fine-tune.py  \
+    --report_to "none" \
+    --data_path "data/belle_chat_ramdon_10k.json" \
+    --model_name_or_path "../baichuan2-7b-base" \
+    --output_dir "output" \
+    --model_max_length 512 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 1 \
+    --save_strategy epoch \
+    --learning_rate 2e-5 \
+    --lr_scheduler_type constant \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.98 \
+    --adam_epsilon 1e-8 \
+    --max_grad_norm 1.0 \
+    --weight_decay 1e-4 \
+    --warmup_ratio 0.0 \
+    --logging_steps 1 \
+    --gradient_checkpointing True \
+    --deepspeed ds_config.json \
+    --fp16
--- a/fine-tune/lora_train.sh
+++ b/fine-tune/lora_train.sh
+hostfile=""
+HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --hostfile=$hostfile fine-tune.py  \
+    --report_to "none" \
+    --data_path "data/belle_chat_ramdon_10k.json" \
+    --model_name_or_path "../baichuan2-7b-base" \
+    --output_dir "output" \
+    --model_max_length 512 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 1 \
+    --save_strategy epoch \
+    --learning_rate 2e-5 \
+    --lr_scheduler_type constant \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.98 \
+    --adam_epsilon 1e-8 \
+    --max_grad_norm 1.0 \
+    --weight_decay 1e-4 \
+    --warmup_ratio 0.0 \
+    --logging_steps 1 \
+    --gradient_checkpointing True \
+    --deepspeed ds_config_zero2.json \
+    --fp16 \
+    --use_lora True
--- a/fine-tune/multi-node/ds_config.json
+++ b/fine-tune/multi-node/ds_config.json
+{
+    "train_micro_batch_size_per_gpu": "auto",
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "initial_scale_power": 16, 
+      "loss_scale_window": 1000,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    }, 
+    "zero_force_ds_cpu_optimizer": false,
+    "zero_optimization": {
+      "stage": 3,
+      "offload_param": {
+        "device": "cpu",
+        "pin_memory": true
+    },
+    "offload_optimizer": {
+        "device": "cpu",
+        "pin_memory": true
+    },
+    "stage3_gather_16bit_weights_on_model_save": true,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+    }
+  }
--- a/fine-tune/multi-node/ds_config_zero2.json
+++ b/fine-tune/multi-node/ds_config_zero2.json
+{
+    "train_micro_batch_size_per_gpu": "auto",
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "initial_scale_power": 16, 
+      "loss_scale_window": 1000,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    }, 
+    "zero_force_ds_cpu_optimizer": false,
+    "zero_optimization": {
+    "stage": 2,
+    "stage3_gather_16bit_weights_on_model_save": true,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+    }
+  }
--- a/fine-tune/multi-node/env.sh
+++ b/fine-tune/multi-node/env.sh
+#!/bin/bash
+export ROCM_PATH=/opt/dtk-23.04
+export ROCM_SOURCE_DIR=${ROCM_PATH}
+echo $ROCM_PATH
+export HIP_PATH=${ROCM_PATH}/hip
+export AMDGPU_TARGETS="gfx900;gfx906"
+export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
+export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${ROCM_PATH}/lib64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=${ROCM_PATH}/hip/lib:${ROCM_PATH}/llvm/lib:${ROCM_PATH}/opencl/lib/x86_64:$LD_LIBRARY_PATH
+export C_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/hip/include/hip:${ROCM_PATH}/llvm/include:/opencl/include:${ROCM_PATH}/include/rocrand:${ROCM_PATH}/include/hiprand
+export CPLUS_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/hip/include/hip:${ROCM_PATH}/llvm/include:/opencl/include:${ROCM_PATH}/include/rocrand:${ROCM_PATH}/include/hiprand
+export PATH=${ROCM_PATH}/miopen/bin:${ROCM_PATH}/rocblas/bin:${ROCM_PATH}/hipsparse/bin:$PATH
+export LD_LIBRARY_PATH=${ROCM_PATH}/miopen/lib:${ROCM_PATH}/rocblas/lib:$LD_LIBRARY_PATH
+export MIOPEN_SYSTEM_DB_PATH=${ROCM_PATH}/miopen/share/miopen/db/
+export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/usr/lib64:$LIBRARY_PATH
+export RCCL_PATH=$ROCM_PATH/rccl
+export NCCL_PATH=$ROCM_PATH/rccl
+export LD_LIBRARY_PATH=$RCCL_PATH/lib:$LD_LIBRARY_PATH
+export MIOPEN_FIND_MODE=3
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_P2P_LEVEL=5
+export NCCL_GDR_FLUSH_DISABLE=1
+export NCCL_NET_GDR_LEVEL=SYS
+export RCCL_NCHANNELS=2
+export NCCL_IB_HCA=mlx5
+export NCCL_SOCKET_IFNAME=ib0 
+export NCCL_DEBUG=INFO
+export MIOPEN_FIND_MODE=3
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_COMPILE_PARALLEL_LEVEL=1
+export NCCL_PLUGIN_P2P=ucx
+export HIP_CLANG_PATH=/opt/dtk-23.04/llvm/bin
+export HSA_PATH=/opt/dtk-23.04/hsa
+export AOMP=/opt/dtk-23.04/llvm
+export LD_LIBRARY_PATH=/opt/dtk-23.04/rccl/lib:/usr/lib64:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/opencl/lib/x86_64:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/dtk-23.04/rccl/lib:/usr/lib64:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/opencl/lib/x86_64:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/dtk-23.04/roctracer/lib:/opt/dtk-23.04/rocthrust/lib:/opt/dtk-23.04/rocsparse/lib:/opt/dtk-23.04/rocsolver/lib:/opt/dtk-23.04/rocrand/lib:/opt/dtk-23.04/rocprofiler/lib:/opt/dtk-23.04/rocprim/lib:/opt/dtk-23.04/dtk-23.04_smi/lib:/opt/dtk-23.04/rocfft/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/rocalution/lib:/opt/dtk-23.04/rccl/lib:/opt/dtk-23.04/opencl/lib:/opt/dtk-23.04/oam/lib:/opt/dtk-23.04/migraphx/lib:/opt/dtk-23.04/miopengemm/lib:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/llvm/lib-debug/src/openmp/libomptarget/plugins/remote/lib:/opt/dtk-23.04/llvm/lib/clang/14.0.0/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/hsa/lib:/opt/dtk-23.04/hipsparse/lib:/opt/dtk-23.04/hipsolver/lib:/opt/dtk-23.04/hiprand/lib:/opt/dtk-23.04/hipfft/lib:/opt/dtk-23.04/hipcub/lib:/opt/dtk-23.04/hipblas-clients/lib:/opt/dtk-23.04/hipblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/mpi/lib:/usr/local/lib/:/usr/local/lib64/:/usr/lib64/
+export PATH=/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/rocblas/bin:/opt/dtk-23.04/hipsparse/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hcc/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/rocblas/bin:/opt/dtk-23.04/hipsparse/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hcc/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/libexec/rocprofiler:/opt/dtk-23.04/libexec/dtk-23.04_smi:/opt/dtk-23.04/rocprofiler/bin:/opt/dtk-23.04/opencl/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/llvm/lib/clang/14.0.0/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/bin:/opt/mpi/bin:/root/anaconda3/bin:/root/anaconda3/condabin:/usr/lib64/qt-3.3/bin:/root/perl5/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/llvm/lib/clang/14.0.0/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/opencl/bin:/opt/dtk-23.04/rocprofiler/bin:/opt/dtk-23.04/libexec/dtk-23.04_smi:/opt/dtk-23.04/libexec/rocprofiler:/opt/rh/devtoolset-7/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin
+export ROCM_ROOT=/opt/dtk-23.04
+export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-23.04/lib/rocblas/library
+export HIP_ROCCLR_HOME=/opt/dtk-23.04/hip
+export HIP_LIB_PATH=/opt/dtk-23.04/hip/lib
+export DEVICE_LIB_PATH=/opt/dtk-23.04/amdgcn/bitcode
\ No newline at end of file
--- a/fine-tune/multi-node/hostfile
+++ b/fine-tune/multi-node/hostfile
+10.0.21.163 slots=8
+10.0.21.116 slots=8
--- a/fine-tune/multi-node/run_ft.sh
+++ b/fine-tune/multi-node/run_ft.sh
+source env.sh
+echo "START TIME: $(date)"
+hostfile=./hostfile
+np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($np*8))
+which mpirun
+mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 run_ft_single.sh 8
+echo "END TIME: $(date)"
--- a/fine-tune/multi-node/run_ft_single.sh
+++ b/fine-tune/multi-node/run_ft_single.sh
+#!/bin/bash
+source env.sh
+GPUS=$1
+string=""
+for ((i=0; i<$GPUS; i++)); do
+  string="$string$i,"
+done
+string=${string%","}
+export HIP_VISIBLE_DEVICES=$string
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+APP="python3 ../fine-tune.py \
+    --deepspeed ../ds_config.json \
+    --report_to "none" \
+    --data_path "../data/belle_chat_ramdon_10k.json" \
+    --model_name_or_path "../../baichuan2-7b-base" \
+    --output_dir "output" \
+    --model_max_length 64 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --save_strategy epoch \
+    --learning_rate 2e-5 \
+    --lr_scheduler_type constant \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.98 \
+    --adam_epsilon 1e-8 \
+    --max_grad_norm 1.0 \
+    --weight_decay 1e-4 \
+    --warmup_ratio 0.0 \
+    --logging_steps 1 \
+    --gradient_checkpointing False \
+    --fp16 \
+    --local_rank $lrank "
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac
--- a/fine-tune/multi-node/run_lora.sh
+++ b/fine-tune/multi-node/run_lora.sh
+source env.sh
+echo "START TIME: $(date)"
+hostfile=./hostfile
+np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($np*8))
+which mpirun
+mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 run_lora_single.sh 8
+echo "END TIME: $(date)"
--- a/fine-tune/multi-node/run_lora_single.sh
+++ b/fine-tune/multi-node/run_lora_single.sh
+#!/bin/bash
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_FIND_MODE=3
+#!/bin/bash
+source env.sh
+GPUS=$1
+string=""
+for ((i=0; i<$GPUS; i++)); do
+  string="$string$i,"
+done
+string=${string%","}
+export HIP_VISIBLE_DEVICES=$string
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+echo "LRANK===============================$lrank"
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+echo "WORLD_SIZE*************$WORLD_SIZE"
+APP="python3 ../fine-tune.py \
+    --deepspeed ../ds_config_zero2.json \
+    --report_to "none" \
+    --data_path "../data/belle_chat_ramdon_10k.json" \
+    --model_name_or_path "../../baichuan2-7b-base" \
+    --output_dir "output-lora" \
+    --model_max_length 64 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --save_strategy epoch \
+    --learning_rate 2e-5 \
+    --lr_scheduler_type constant \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.98 \
+    --adam_epsilon 1e-8 \
+    --max_grad_norm 1.0 \
+    --weight_decay 1e-4 \
+    --warmup_ratio 0.0 \
+    --logging_steps 1 \
+    --gradient_checkpointing False \
+    --fp16 \
+    --use_lora True \
+    --local_rank $lrank "
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac
--- a/fine-tune/requirements.txt
+++ b/fine-tune/requirements.txt
+numpy
+transformers==4.28.0
+sentencepiece
+tokenizers
+accelerate
--- a/fine-tune/slurm_script/hostfile/46547085
+++ b/fine-tune/slurm_script/hostfile/46547085
+f14r1n19
+f14r2n00
+f14r2n01
+f14r2n02
+f14r2n03
+f14r2n04
+f14r2n05
+f14r2n06
+f14r2n07
+f14r2n08
+f14r2n09
+f14r2n10
+f14r2n11
+f14r2n12
+f14r2n13
+f14r2n14
+f14r2n15
+f14r2n16
+f14r2n17
+f14r2n18
+f14r2n19
+f14r3n00
+f14r3n01
+f14r3n02