update codes

af59133f · dengjb · eb1405e9 · af59133f · af59133f · af59133f
Commit af59133f authored Jun 12, 2024 by dengjb
13 changed files
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/en/_build/
+docs/zh_cn/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.vscode
+.idea
+.DS_Store
+# custom
+*.pkl
+*.pkl.json
+*.log.json
+# Pytorch
+*.pth
+*.py~
+*.sh~
\ No newline at end of file
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
-# starcoder2_pytorch
+# StarCoder2
+StarCoder2模型是一系列3B、7B和15B模型，使用来自Stack-v2数据集的3.3 至4.3万亿个代码标记进行训练，包含600多种编程语言。
+## 论文
+`StarCoder 2 and The Stack v2: The Next Generation`<br>
+[StarCoder2](https://arxiv.org/pdf/2402.19173)
+## 模型结构
+StarCoder2的模型结构主要基于StarCoderBase模型架构进行了微小的改动，首先使用RoPE旋转位置编码。其次使用了GQA模块替换了MQA模块。
+<div align=center>
+    <img src="./asserts/model_architecture.png"/>
+</div>
+## 算法原理
+使用GQA模块能够带来更好的速度，使用GQA的head数量不同则会带来速度和性能平衡转换<br>
+使用了RoPE位置旋转编码来替代Embedding编码，使得模型获得更好的外推性。<br>
+<div align=center>
+    <img src="./asserts/model_blocks.png"/>
+</div>
+## 环境配置
+-v 路径、docker_name和imageID根据实际情况修改
-StarCoder2 is a family of code generation models 
+### Docker（方法一）
\ No newline at end of file
+```bash
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
+docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
+cd /your_code_path/starcoder2_pytorch
+pip install -r requirements.txt  -i http://mirrors.huaweicloud.com/repository/pypi/simple
+export HF_ENDPOINT=https://hf-mirror.com
+```
+### Dockerfile（方法二）
+```bash
+cd docker
+docker build --no-cache -t starcoder2:latest .
+docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
+cd /your_code_path/starcoder2_pytorch
+pip install -r requirements.txt  -i http://mirrors.huaweicloud.com/repository/pypi/simple
+export HF_ENDPOINT=https://hf-mirror.com
+```
+### Anaconda（方法三）
+关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.hpccube.com/tool/)开发者社区下载安装。
+```
+DTK驱动: dtk24.04
+python: python3.10
+torch: 2.1.0
+```
+`Tips：以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应`
+其它非深度学习库安装方式如下：
+```bash
+pip install -r requirements.txt  -i http://mirrors.huaweicloud.com/repository/pypi/simple
+export HF_ENDPOINT=https://hf-mirror.com
+```
+## 数据集
+finetune训练样例数据采用bigcode/the-stack-smol 下的子集/data/rust [下载地址](https://hf-mirror.com/datasets/bigcode/the-stack-smol)<br>
+```angular2html
+├── data
+│   ├── assembly
+│   │   └── data.json
+│   ├── rust                                                                                          
+│   │   └── data.json  
+......
+```
+<div align=center>
+    <img src="./asserts/dataset.png"/>
+</div>
+## 训练
+### 单机四卡
+具体参数更改请在train.sh文件中进行,以下为必要参数 <br>
+dataset_name="{数据集地址}" <br>
+model_name="{预训练模型加载地址}" <br>
+```bash
+bash ./train.sh
+```
+## 推理
+基于Huggingface's Transformers进行推理.<br>
+模型下载后 默认需存放至weights文件夹中<br>
+也可自行更改 inference.py文件中的 checkpoint 变量<br>
+```bash
+HIP_VISIBLE_DEVICES=0 python inference.py
+```
+## Result
+prompt：`def print_hello_world():`<br>
+result：
+<div align=center>
+    <img src="./asserts/result.png"/>
+</div>
+### 精度
+暂无
+## 应用场景
+### 算法类别
+代码生成
+### 热点应用行业
+制造,能源,教育
+## 预训练权重
+模型目录结构如下：
+```
+# starcoder2-7b/
+├── config.json                                                                                           
+├── generation_config.json                                                                                
+├── merges.txt                                                                                            
+├── model-00001-of-00003.safetensors                                                                      
+├── model-00002-of-00003.safetensors                                                                      
+├── model-00003-of-00003.safetensors                                                                      
+├── model.safetensors.index.json                                                                          
+├── README.md                                                                                             
+├── special_tokens_map.json                                                                               
+├── tokenizer_config.json                                                                                 
+├── tokenizer.json                                                                                        
+└── vocab.json 
+```
+## 源码仓库及问题反馈
+- https://developer.hpccube.com/codes/modelzoo/starcoder2_pytorch
+## 参考资料
+- https://github.com/bigcode-project/starcoder2
+- https://huggingface.co/bigcode
--- a/README_ori.md
+++ b/README_ori.md
+# StarCoder 2
+<p align="center"><a href="https://huggingface.co/bigcode">[🤗 Models & Datasets]</a> | <a href="https://arxiv.org/abs/2402.19173">[Paper]</a>
+</p>
+StarCoder2 is a family of code generation models (3B, 7B, and 15B), trained on 600+ programming languages from [The Stack v2](https://huggingface.co/datasets/bigcode/the-stack-v2) and some natural language text such as Wikipedia, Arxiv, and GitHub issues. The models use Grouped Query Attention, a context window of 16,384 tokens, with sliding window attention of 4,096 tokens. The 3B & 7B models were trained on 3+ trillion tokens, while the 15B was trained on 4+ trillion tokens. For more details check out the [paper](https://drive.google.com/file/d/17iGn3c-sYNiLyRSY-A85QOzgzGnGiVI3/view).
+# Table of Contents
+1. [Quickstart](#quickstart)
+    - [Installation](#installation)
+    - [Model usage and memory footprint](#model-usage-and-memory-footprint)
+    - [Text-generation-inference code](#text-generation-inference)
+2. [Fine-tuning](#fine-tuning)
+    - [Setup](#setup)
+    - [Training](#training)
+3. [Evaluation](#evaluation)
+# Quickstart
+StarCoder2 models are intended for code completion, they are not instruction models and commands like "Write a function that computes the square root." do not work well. 
+## Installation
+First, we have to install all the libraries listed in `requirements.txt`
+```bash
+pip install -r requirements.txt
+# export your HF token, found here: https://huggingface.co/settings/account
+export HF_TOKEN=xxx
+```
+## Model usage and memory footprint
+Here are some examples to load the model and generate code, with the memory footprint of the largest model, `StarCoder2-15B`. Ensure you've installed `transformers` from source (it should be the case if you used `requirements.txt`)
+```bash
+pip install git+https://github.com/huggingface/transformers.git
+```
+### Running the model on CPU/GPU/multi GPU
+* _Using full precision_
+```python
+# pip install git+https://github.com/huggingface/transformers.git # TODO: merge PR to main
+from transformers import AutoModelForCausalLM, AutoTokenizer
+checkpoint = "bigcode/starcoder2-15b"
+device = "cuda" # for GPU usage or "cpu" for CPU usage
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# to use Multiple GPUs do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
+model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
+inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+* _Using `torch.bfloat16`_
+```python
+# pip install accelerate
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+checkpoint = "bigcode/starcoder2-15b"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# for fp16 use `torch_dtype=torch.float16` instead
+model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
+inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to("cuda")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+```bash
+>>> print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
+Memory footprint: 32251.33 MB
+```
+### Quantized Versions through `bitsandbytes`
+* _Using 8-bit precision (int8)_
+```python
+# pip install bitsandbytes accelerate
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+# to use 4bit use `load_in_4bit=True` instead
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+checkpoint = "bigcode/starcoder2-15b_16k"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder2-15b_16k", quantization_config=quantization_config)
+inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to("cuda")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+```bash
+>>> print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
+# load_in_8bit
+Memory footprint: 16900.18 MB
+# load_in_4bit
+>>> print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
+Memory footprint: 9224.60 MB
+```
+You can also use `pipeline` for the generation:
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+checkpoint = "bigcode/starcoder2-15b"
+model = AutoModelForCausalLM.from_pretrained(checkpoint)
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
+print( pipe("def hello():") )
+```
+## Text-generation-inference: 
+```bash
+docker run -p 8080:80 -v $PWD/data:/data -e HUGGING_FACE_HUB_TOKEN=<YOUR BIGCODE ENABLED TOKEN> -d  ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder2-15b --max-total-tokens 8192
+```
+For more details, see [here](https://github.com/huggingface/text-generation-inference).
+# Fine-tuning
+Here, we showcase how you can fine-tune StarCoder2 models. For more fine-tuning resources you can check [StarCoder's GitHub repository](https://github.com/bigcode-project/starcoder) and [SantaCoder-Finetuning](https://github.com/loubnabnl/santacoder-finetuning).
+## Setup
+Install `pytorch` [see documentation](https://pytorch.org/), for example the following command works with cuda 12.1:
+```bash
+conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
+```
+Install the requirements (this installs `transformers` from source to support the StarCoder2 architecture):
+```bash
+pip install -r requirements.txt
+```
+Before you run any of the scripts make sure you are logged in `wandb` and HuggingFace Hub to push the checkpoints:
+```bash
+wandb login
+huggingface-cli login
+``` 
+Now that everything is done, you can clone the repository and get into the corresponding directory.
+## Training
+To fine-tune efficiently with a low cost, we use [PEFT](https://github.com/huggingface/peft) library for Low-Rank Adaptation (LoRA) training and [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) for 4bit quantization. We also use the `SFTTrainer` from [TRL](https://github.com/huggingface/trl).
+For this example, we will fine-tune StarCoder2-3b on the `Rust` subset of [the-stack-smol](https://huggingface.co/datasets/bigcode/the-stack-smol). This is just for illustration purposes; for a larger and cleaner dataset of Rust code, you can use [The Stack dedup](https://huggingface.co/datasets/bigcode/the-stack-dedup). 
+To launch the training:
+```bash
+accelerate launch finetune.py \
+        --model_id "bigcode/starcoder2-3b" \
+        --dataset_name "bigcode/the-stack-smol" \
+        --subset "data/rust" \
+        --dataset_text_field "content" \
+        --split "train" \
+        --max_seq_length 1024 \
+        --max_steps 10000 \
+        --micro_batch_size 1 \
+        --gradient_accumulation_steps 8 \
+        --learning_rate 2e-5 \
+        --warmup_steps 20 \
+        --num_proc "$(nproc)"
+```
+If you want to fine-tune on other text datasets, you need to change `dataset_text_field` argument to the name of the column containing the code/text you want to train on.
+# Evaluation
+To evaluate StarCoder2 and its derivatives, you can use the [BigCode-Evaluation-Harness](https://github.com/bigcode-project/bigcode-evaluation-harness) for evaluating Code LLMs. You can also check the [BigCode Leaderboard](https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard).
--- a/asserts/dataset.png
+++ b/asserts/dataset.png
--- a/asserts/model_architecture.png
+++ b/asserts/model_architecture.png
--- a/asserts/model_blocks.png
+++ b/asserts/model_blocks.png
--- a/asserts/result.png
+++ b/asserts/result.png
--- a/finetune.py
+++ b/finetune.py
+# Code adapted from https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
+# and https://huggingface.co/blog/gemma-peft
+import argparse
+import multiprocessing
+import os
+import torch
+import transformers
+from accelerate import PartialState
+from datasets import load_dataset
+from peft import LoraConfig
+from transformers import (
+    AutoModelForCausalLM,
+    BitsAndBytesConfig,
+    logging,
+    set_seed,
+)
+from trl import SFTTrainer
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_id", type=str, default="bigcode/starcoder2-3b")
+    parser.add_argument("--dataset_name", type=str, default="the-stack-smol")
+    parser.add_argument("--subset", type=str, default="data/rust")
+    parser.add_argument("--split", type=str, default="train")
+    parser.add_argument("--dataset_text_field", type=str, default="content")
+    parser.add_argument("--max_seq_length", type=int, default=1024)
+    parser.add_argument("--max_steps", type=int, default=1000)
+    parser.add_argument("--micro_batch_size", type=int, default=1)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--bf16", type=bool, default=True)
+    parser.add_argument("--attention_dropout", type=float, default=0.1)
+    parser.add_argument("--learning_rate", type=float, default=2e-4)
+    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
+    parser.add_argument("--warmup_steps", type=int, default=100)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--output_dir", type=str, default="finetune_starcoder2")
+    parser.add_argument("--num_proc", type=int, default=None)
+    parser.add_argument("--push_to_hub", type=bool, default=True)
+    return parser.parse_args()
+def print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+def main(args):
+    """
+    Fine-tune a causal language model with LoRA adapters using TRL's SFTTrainer.
+    ``args`` is the namespace returned by get_args(). The function loads the
+    model and dataset, trains for ``args.max_steps`` steps, saves the model
+    under ``<output_dir>/final_checkpoint/`` and, when ``args.push_to_hub`` is
+    truthy, pushes the result to the Hugging Face Hub.
+    """
+    # config
+    # NOTE: bnb_config is built but currently unused -- the quantization_config
+    # argument below is commented out and load_in_4bit is False.
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=False,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+    # LoRA adapters of rank 8 on the listed projection modules.
+    lora_config = LoraConfig(
+        r=8,
+        target_modules=[
+            "q_proj",
+            "o_proj",
+            "k_proj",
+            "v_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        task_type="CAUSAL_LM",
+    )
+    # load model and dataset
+    # Hub access token is taken from the HF_TOKEN environment variable (None if unset).
+    token = os.environ.get("HF_TOKEN", None)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_id,
+        # quantization_config=bnb_config,
+        # map the root module ("") to this process's index from accelerate's PartialState
+        device_map={"": PartialState().process_index},
+        attention_dropout=args.attention_dropout,
+    )
+    print_trainable_parameters(model)
+    data = load_dataset(
+        args.dataset_name,
+        data_dir=args.subset,
+        split=args.split,
+        token=token,
+        # fall back to the machine's CPU count when --num_proc is not given
+        num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
+    )
+    # setup the trainer
+    trainer = SFTTrainer(
+        model=model,
+        train_dataset=data,
+        max_seq_length=args.max_seq_length,
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=args.micro_batch_size,
+            gradient_accumulation_steps=args.gradient_accumulation_steps,
+            warmup_steps=args.warmup_steps,
+            max_steps=args.max_steps,
+            learning_rate=args.learning_rate,
+            lr_scheduler_type=args.lr_scheduler_type,
+            weight_decay=args.weight_decay,
+            bf16=args.bf16,
+            logging_strategy="steps",
+            logging_steps=10,
+            output_dir=args.output_dir,
+            optim="adamw_hf",
+            seed=args.seed,
+            # run name is derived from the last path component of the model id
+            run_name=f"train-{args.model_id.split('/')[-1]}",
+            report_to="all",
+        ),
+        peft_config=lora_config,
+        dataset_text_field=args.dataset_text_field,
+    )
+    # launch
+    print("Training...")
+    trainer.train()
+    print("Saving the last checkpoint of the model")
+    model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))
+    # upload only when requested on the command line
+    if args.push_to_hub:
+        trainer.push_to_hub("Upload model")
+    print("Training Done! 💥")
+if __name__ == "__main__":
+    args = get_args()
+    set_seed(args.seed)
+    os.makedirs(args.output_dir, exist_ok=True)
+    logging.set_verbosity_error()
+    main(args)
--- a/inference.py
+++ b/inference.py
+# pip install git+https://github.com/huggingface/transformers.git # TODO: merge PR to main
+from transformers import AutoModelForCausalLM, AutoTokenizer
+checkpoint = "bigcode/starcoder2-7b"
+device = "cuda" # for GPU usage or "cpu" for CPU usage
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
+model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
+inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
\ No newline at end of file
--- a/model.properties
+++ b/model.properties
+# 模型唯一标识
+modelCode=650
+# 模型名称
+modelName=starcoder2_pytorch
+# 模型描述
+modelDescription=StarCoder2模型是一系列3B、7B和15B模型，使用来自Stack-v2数据集的3.3 至4.3万亿个代码标记进行训练，包含600多种编程语言
+# 应用场景
+appScenario=推理,训练,代码生成,制造,能源,教育
+# 框架类型
+frameType=pytorch
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+transformers==4.39.3
+trl==0.9.4
+accelerate==0.27.1
+datasets>=2.16.1
+peft==0.8.2
+wandb==0.16.3
+huggingface_hub==0.20.3
\ No newline at end of file
--- a/train.sh
+++ b/train.sh
+export CUDA_VISIBLE_DEVICES=1,2
+nproc=2
+model_name="/home/starcoder2/starcoder2-7b/"
+dataset_name="/home/starcoder2/the-stack-smol/"
+accelerate launch finetune.py \
+        --model_id $model_name \
+        --dataset_name $dataset_name \
+        --subset "data/rust" \
+        --dataset_text_field "content" \
+        --split "train" \
+        --max_seq_length 1024 \
+        --max_steps 10000 \
+        --micro_batch_size 1 \
+        --gradient_accumulation_steps 8 \
+        --learning_rate 2e-5 \
+        --warmup_steps 20 \
+        --num_proc "$(nproc)"