Unverified Commit d66e6988, authored by Frank Lee and committed by GitHub

Merge pull request #5278 from ver217/sync/npu

[sync] sync npu branch with main
parents 9102d655 14846934
1.12.0-11.3.0
1.13.0-11.6.0
2.0.0-11.7.0
2.1.0-11.8.0
@@ -22,57 +22,6 @@ on:
delete:
jobs:
prepare_cache:
name: Prepare testmon cache
if: |
github.event_name == 'create' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Copy testmon cache
run: | # the branch name may contain a slash, so replace it with a space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
prepare_cache_for_pr:
name: Prepare testmon cache for PR
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-prepare-cache
cancel-in-progress: true
steps:
- name: Copy testmon cache
run: | # the branch name may contain a slash, so replace it with a space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}
detect:
name: Detect file change
if: |
@@ -140,8 +89,8 @@ jobs:
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 60
defaults:
run:
@@ -174,6 +123,7 @@ jobs:
run: |
cd TensorNVMe
cp -p -r ./build /github/home/tensornvme_cache/
cp -p -r ./cmake-build /github/home/tensornvme_cache/
- name: Checkout Colossal-AI
uses: actions/checkout@v2
@@ -198,31 +148,24 @@ jobs:
# the -p flag is required to preserve file timestamps and avoid a ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
env:
PR_NUMBER: ${{ github.event.number }}
- name: Execute Unit Testing
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
-m "not largedist" \
--durations=0 \
--ignore tests/test_analyzer \
--ignore tests/test_auto_parallel \
--ignore tests/test_fx \
--ignore tests/test_autochunk \
--ignore tests/test_gptq \
--ignore tests/test_infer_ops \
--ignore tests/test_legacy \
--ignore tests/test_smoothquant \
tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
LLAMA_PATH: /data/scratch/llama-tiny
- name: Store Testmon Cache
run: |
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
env:
PR_NUMBER: ${{ github.event.number }}
- name: Collate artifact
env:
PR_NUMBER: ${{ github.event.number }}
@@ -259,54 +202,3 @@ jobs:
with:
name: report
path: report/
store_cache:
name: Store testmon cache for PR
if: |
github.event_name == 'pull_request' &&
github.event.action == 'closed' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Store testmon cache if possible
if: github.event.pull_request.merged == true
run: | # the branch name may contain a slash, so replace it with a space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Remove testmon cache
run: |
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
remove_cache:
name: Remove testmon cache
if: |
github.event_name == 'delete' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Remove testmon cache
run: | # the branch name may contain a slash, so replace it with a space
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
rm -rf "/github/home/testmon_cache/${BASE}"
@@ -10,20 +10,22 @@ jobs:
build:
name: Build and Test Colossal-AI
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 40
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 90
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
id: check-avai
run: |
avai=true
for i in $(seq 0 7);
ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
endIndex=$(($ngpu-1))
for i in $(seq 0 $endIndex);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "10000" ] && avai=false
[ "$gpu_used" -gt "2000" ] && avai=false
done
echo "GPU is available: $avai"
@@ -60,9 +62,12 @@ jobs:
- name: Unit Testing
if: steps.check-avai.outputs.avai == 'true'
run: |
PYTHONPATH=$PWD pytest --durations=0 tests
PYTHONPATH=$PWD pytest \
-m "not largedist" \
--durations=0 \
tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
@@ -71,7 +76,7 @@ jobs:
if: ${{ failure() }}
run: |
url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
msg="Scheduled Build and Test failed, please visit $url for details"
echo $msg
python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
env:
@@ -56,7 +56,7 @@ jobs:
needs: detect-changed-doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --gpus all --rm
timeout-minutes: 20
defaults:
@@ -12,7 +12,7 @@ jobs:
name: Test the changed Doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm
timeout-minutes: 60
steps:
@@ -45,7 +45,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 15
steps:
@@ -77,9 +77,9 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 15
timeout-minutes: 20
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }}
cancel-in-progress: true
@@ -34,8 +34,8 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
timeout-minutes: 15
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
timeout-minutes: 10
steps:
- name: 📚 Checkout
uses: actions/checkout@v3
@@ -25,15 +25,16 @@
</div>
## Latest News
* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
* [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
* [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b)
* [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
## Table of Contents
<ul>
@@ -52,6 +53,7 @@
<a href="#Parallel-Training-Demo">Parallel Training Demo</a>
<ul>
<li><a href="#LLaMA2">LLaMA 1/2</a></li>
<li><a href="#MoE">MoE</a></li>
<li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li>
@@ -69,8 +71,9 @@
</ul>
</li>
<li>
<a href="#Inference-Energon-AI-Demo">Inference (Energon-AI) Demo</a>
<a href="#Inference">Inference</a>
<ul>
<li><a href="#SwiftInfer">SwiftInfer:Breaks the Length Limit of LLM for Multi-Round Conversations with 46% Acceleration</a></li>
<li><a href="#GPT-3-Inference">GPT-3</a></li>
<li><a href="#OPT-Serving">OPT-175B Online Serving for Text Generation</a></li>
<li><a href="#BLOOM-Inference">176B BLOOM</a></li>
@@ -120,43 +123,44 @@ distributed training and inference in a few lines.
- Friendly Usage
- Parallelism based on the configuration file
- Inference
- [Energon-AI](https://github.com/hpcaitech/EnergonAI)
<p align="right">(<a href="#top">back to top</a>)</p>
## Colossal-AI in the Real World
### Colossal-LLaMA-2
- One half-day of training using a few hundred dollars yields similar results to mainstream large models, open-source and commercial-free domain-specific LLM solution.
- 7B: One half-day of training using a few hundred dollars yields similar results to mainstream large models, open-source and commercial-free domain-specific LLM solution.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
[[blog]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-7b-base/summary)
| | Backbone | Tokens Consumed | | MMLU | CMMLU | AGIEval | GAOKAO | CEval |
| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
| | | - | | 5-shot | 5-shot | 5-shot | 0-shot | 5-shot |
| Baichuan-7B | - | 1.2T | | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| | | | | | | | | |
| Llama-2-7B | - | 2.0T | | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | | 50.25 | 40.99 | 40.04 | 30.54 | - |
| | | | | | | | | |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
- 13B: Construct refined 13B private model with just $5000 USD.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
[[blog]](https://hpc-ai.com/blog/colossal-llama-2-13b)
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base)
[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary)
| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) |
| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: |
| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
### ColossalChat
@@ -215,7 +219,7 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]
- [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.
<p id="inference" align="center">
<p id="inference-sd" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
</p>
@@ -267,6 +271,15 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)
### MoE
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/MOE_training.png" width=800/>
</p>
- Enhanced MoE parallelism, Open-source MoE model training can be 9 times more efficient
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/openmoe)
[[blog]](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
### GPT-3
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/GPT3-v5.png" width=700/>
@@ -336,7 +349,12 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
<p align="right">(<a href="#top">back to top</a>)</p>
## Inference (Energon-AI) Demo
## Inference
<p id="SwiftInfer" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/SwiftInfer.jpg" width=800/>
</p>
- [SwiftInfer](https://github.com/hpcaitech/SwiftInfer): Inference performance improved by 46%, open source solution breaks the length limit of LLM for multi-round conversations
<p id="GPT-3-Inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference_GPT-3.jpg" width=800/>
@@ -361,7 +379,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
## Installation
Requirements:
- PyTorch >= 1.11 (PyTorch 2.x in progress)
- PyTorch >= 1.11 and PyTorch <= 2.1
- Python >= 3.7
- CUDA >= 11.0
- [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)
@@ -495,11 +513,22 @@ This project is inspired by some related projects (some by our team and some by
To cite this project, you can use the following BibTeX citation.
```
@article{bian2021colossal,
title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
journal={arXiv preprint arXiv:2110.14883},
year={2021}
@inproceedings{10.1145/3605573.3605613,
author = {Li, Shenggui and Liu, Hongxin and Bian, Zhengda and Fang, Jiarui and Huang, Haichen and Liu, Yuliang and Wang, Boxiang and You, Yang},
title = {Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
year = {2023},
isbn = {9798400708435},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3605573.3605613},
doi = {10.1145/3605573.3605613},
abstract = {The success of Transformer models has pushed the deep learning model scale to billions of parameters, but the memory limitation of a single GPU has led to an urgent need for training on multi-GPU clusters. However, the best practice for choosing the optimal parallel strategy is still lacking, as it requires domain expertise in both deep learning and parallel computing. The Colossal-AI system addressed the above challenge by introducing a unified interface to scale your sequential code of model training to distributed environments. It supports parallel training methods such as data, pipeline, tensor, and sequence parallelism and is integrated with heterogeneous training and zero redundancy optimizer. Compared to the baseline system, Colossal-AI can achieve up to 2.76 times training speedup on large-scale models.},
booktitle = {Proceedings of the 52nd International Conference on Parallel Processing},
pages = {766–775},
numpages = {10},
keywords = {datasets, gaze detection, text tagging, neural networks},
location = {Salt Lake City, UT, USA},
series = {ICPP '23}
}
```
@@ -461,17 +461,19 @@ Thanks so much to all of our amazing contributors!
Coati is developed by ColossalAI Team:
- [Fazzie](https://fazzie-key.cool/about/index.html)
- [FrankLeeeee](https://github.com/FrankLeeeee)
- [BlueRum](https://github.com/ht-zhou)
- [ver217](https://github.com/ver217)
- [ofey404](https://github.com/ofey404)
- [Wenhao Chen](https://github.com/CWHer)
- [ver217](https://github.com/ver217) Leading the project while contributing to the main framework.
- [FrankLeeeee](https://github.com/FrankLeeeee) Providing ML infra support and also taking charge of both front-end and back-end development.
- [htzhou](https://github.com/ht-zhou) Contributing to the algorithm and development for RM and PPO training.
- [Fazzie](https://fazzie-key.cool/about/index.html) Contributing to the algorithm and development for SFT.
- [ofey404](https://github.com/ofey404) Contributing to both front-end and back-end development.
- [Wenhao Chen](https://github.com/CWHer) Contributing to subsequent code enhancements and performance improvements.
PhD students from the [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project.
- [Zangwei Zheng](https://github.com/zhengzangw)
- [Xue Fuzhao](https://github.com/XueFuzhao)
We also appreciate the valuable suggestions provided by [Jian Hu](https://github.com/hijkzzz) regarding the convergence of the PPO algorithm.
## Citations
```bibtex
This diff is collapsed.
# Copyright 2023 lm-sys@FastChat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
from enum import Enum, auto
from typing import List
class SeparatorStyle(Enum):
ADD_BOS_EOS_TOKEN = auto()
@dataclasses.dataclass
class Conversation:
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle
seps: List[str]
def clear(self):
self.messages = []
def get_prompt(self, length: int = None):
if length is None:
length = len(self.messages)
if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
ret = self.system
for role, message in self.messages[0:length]:
if message:
ret += role + ": " + self.seps[0] + message + self.seps[1]
else:
ret += role + ": " + self.seps[0]
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")
def save_prompt(self):
if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
ret = self.system
for role, message in self.messages:
if message:
ret += role + ": " + self.seps[0] + message + self.seps[1] + "\n"
else:
ret += role + ": " + self.seps[0]
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")
def append_message(self, role, message):
self.messages.append([role, message])
def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
seps=self.seps,
)
def dict(self):
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
"seps": self.seps,
}
conv = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
roles=("Human", "Assistant"),
messages=[],
offset=0,
sep_style=SeparatorStyle.ADD_BOS_EOS_TOKEN,
seps=["<s>", "</s>"],
)
default_conversation = conv
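A hedged usage sketch (editor's illustration, not part of the original file), assuming the module is importable as `colossal_llama2.dataset.conversation`, the path used by the preparation scripts below:
```python
from colossal_llama2.dataset.conversation import default_conversation

# Each turn renders as "<role>: <s><message></s>"; an empty assistant message
# leaves an open "Assistant: <s>" for the model to complete.
conv = default_conversation.copy()
conv.append_message("Human", "What is Colossal-AI?")
conv.append_message("Assistant", "")
print(conv.get_prompt())
# ...polite answers to the human's questions.\n\nHuman: <s>What is Colossal-AI?</s>Assistant: <s>
```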
@@ -4,22 +4,29 @@
Splicing multiple pre-tokenized sequence data points
"""
import bisect
import random
import warnings
from copy import deepcopy
from datasets import dataset_dict
from typing import Any, Callable, Dict, Iterable, List, Union, Tuple
from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
from datasets import dataset_dict
from torch.utils.data import ConcatDataset, Dataset, IterableDataset
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from transformers.tokenization_utils import PreTrainedTokenizer
from colossalai.logging import get_dist_logger
from .conversation import Conversation, default_conversation
logger = get_dist_logger()
IGNORE_INDEX = -100
DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
def supervised_tokenize(
def supervised_tokenize_pretrain(
data_point: Dict[str, str], tokenizer: LlamaTokenizer, ignore_index: int = None, max_length: int = 4096
) -> Dict[str, Union[int, str, List[int]]]:
"""
@@ -62,6 +69,121 @@ def supervised_tokenize(
)
def supervised_tokenize_sft(
data_point: Dict[str, str],
tokenizer: LlamaTokenizer,
conversation_template: Conversation = default_conversation,
ignore_index: int = None,
max_length: int = 4096,
) -> Dict[str, Union[int, str, List[int]]]:
"""
A tokenization function to tokenize an original supervised data point as follows:
{"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
"""
assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
"Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
"add <bos> and <eos> manually later"
)
assert (
tokenizer.bos_token == conversation_template.seps[0] and tokenizer.eos_token == conversation_template.seps[1]
), "`bos_token` and `eos_token` should be the same with `conversation_template.seps`."
if ignore_index is None:
ignore_index = IGNORE_INDEX
messages = data_point["messages"]
template = deepcopy(conversation_template)
template.messages = []
for mess in messages:
from_str = mess["from"]
if from_str.lower() == "human":
from_str = template.roles[0]
elif from_str.lower() == "assistant":
from_str = template.roles[1]
else:
raise ValueError(f"Unsupported role {from_str.lower()}")
template.append_message(from_str, mess["content"])
if len(template.messages) % 2 != 0:
template.messages = template.messages[0:-1]
# `target_turn_index` is the index of the first turn count whose tokenized prompt length exceeds `max_length - 1`.
turns = [i for i in range(1, len(messages) // 2 + 1)]
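# Note: the `key` argument of `bisect.bisect_right` below requires Python >= 3.10.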
target_turn_index = bisect.bisect_right(
turns,
max_length - 1,
key=lambda x: len(tokenizer([template.get_prompt(2 * x)], add_special_tokens=False)["input_ids"][0]),
)
# Even the first turn's tokenized length already exceeds `max_length - 1`.
if target_turn_index - 1 < 0:
return dict(
input_ids=None,
labels=None,
inputs_decode=None,
labels_decode=None,
seq_length=None,
seq_category=None,
)
target_turn = turns[target_turn_index - 1]
prompt = template.get_prompt(2 * target_turn)
tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
template.messages = template.messages[0 : 2 * target_turn]
starts = []
ends = []
# True when the next <bos>/<eos> encountered belongs to an assistant turn.
gpt_bos = template.messages[0][0] != template.roles[0]
gpt_eos = template.messages[0][0] != template.roles[0]
for i, token_id in enumerate(tokenized):
if token_id == tokenizer.bos_token_id:
if gpt_bos:
starts.append(i)
gpt_bos = not gpt_bos
elif token_id == tokenizer.eos_token_id:
if gpt_eos:
ends.append(i)
gpt_eos = not gpt_eos
if len(starts) != target_turn or len(ends) != target_turn:
logger.info(
"Please check whether the tokenizer add additional `bos_token` and `eos_token`.\n\nOr the original message contains `bos_token` or `eos_token`."
)
return dict(
input_ids=None,
labels=None,
inputs_decode=None,
labels_decode=None,
seq_length=None,
seq_category=None,
)
tokenized = [tokenizer.bos_token_id] + tokenized
labels = [ignore_index] * len(tokenized)
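# Only assistant turns contribute training targets: each (start, end) pair below
# brackets an assistant <bos>/<eos> span, and everything outside those spans
# stays `ignore_index` so it is excluded from the loss.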
for start, end in zip(starts, ends):
labels[start + 1 : end + 2] = tokenized[start + 1 : end + 2]
labels_decode = deepcopy(labels)
for i, z in enumerate(labels_decode):
if z == ignore_index:
labels_decode[i] = tokenizer.unk_token_id
# `inputs_decode` and `labels_decode` can be used to check whether the tokenization method is correct.
return dict(
input_ids=tokenized,
labels=labels,
inputs_decode=tokenizer.decode(tokenized),
labels_decode=tokenizer.decode(labels_decode),
seq_length=len(tokenized),
seq_category=data_point["category"] if "category" in data_point else "None",
)
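A hedged sketch of calling this function on one data point (editor's illustration; the tokenizer path is a placeholder, and the tokenizer flags mirror `prepare_sft_dataset.py` below):
```python
from colossal_llama2.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
from transformers.models.llama.tokenization_llama import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("/path/to/llama/tokenizer")  # placeholder path
tokenizer.add_bos_token = False  # <bos>/<eos> are inserted manually by the function
tokenizer.add_eos_token = False

data_point = {
    "messages": [
        {"from": "human", "content": "What is Colossal-AI?"},
        {"from": "assistant", "content": "A system for large-scale parallel training."},
    ]
}
result = supervised_tokenize_sft(data_point, tokenizer)
# result["labels"] keeps real token ids only inside the assistant reply;
# everything else is IGNORE_INDEX (-100).
```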
class ClosedToConstantLengthSplicedDataset(IterableDataset):
"""
Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
@@ -169,12 +291,7 @@ class ClosedToConstantLengthSplicedDataset(IterableDataset):
spliced_labels.extend(seq_labels)
# For residual spliced data point at the end of the data set
if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
examples.append(
{
self.input_ids_field: spliced_input_ids,
self.labels_field: spliced_labels
}
)
examples.append({self.input_ids_field: spliced_input_ids, self.labels_field: spliced_labels})
if self.shuffle:
random.shuffle(examples)
for spliced_data_point in examples:
# Copyright 2023 The Hugging Face team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
def unwrap(model):
# Peel off the wrapper layers to reach the underlying torch module.
return model.unwrap().module
def neftune_post_forward_hook(module, input, output):
"""
Implements the NEFTune forward pass for the model using forward hooks. Note this works only for torch.nn.Embedding
layers. This method is slightly adapted from the original source code that can be found here:
https://github.com/neelsjain/NEFTune Simply add it to your model as follows:
```python
model = ...
model.embed_tokens.neftune_noise_alpha = 0.1
model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
```
Args:
module (`torch.nn.Module`):
The embedding module where the hook is attached. Note that you need to set `module.neftune_noise_alpha` to
the desired noise alpha value.
input (`torch.Tensor`):
The input tensor to the model.
output (`torch.Tensor`):
The output tensor of the model (i.e. the embeddings).
"""
if module.training:
dims = torch.tensor(output.size(1) * output.size(2))
mag_norm = module.neftune_noise_alpha / torch.sqrt(dims)
output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
return output
def activate_neftune(model, neftune_noise_alpha=0.1):
r"""
Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper:
https://arxiv.org/abs/2310.05914
"""
embeddings = unwrap(model).get_input_embeddings()
embeddings.neftune_noise_alpha = neftune_noise_alpha
hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)
return model, hook_handle
def deactivate_neftune(model, neftune_hook_handle):
"""
Deactivates the neftune method. Make sure to call `activate_neftune` first.
"""
embeddings = unwrap(model).get_input_embeddings()
neftune_hook_handle.remove()
del embeddings.neftune_noise_alpha
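A minimal sketch of wrapping a training loop with these helpers (editor's illustration; `model` is assumed to be wrapped so that `unwrap(model)` reaches the embeddings, and `dataloader`/`train_step` are hypothetical):
```python
model, handle = activate_neftune(model, neftune_noise_alpha=0.1)
for batch in dataloader:       # hypothetical dataloader
    train_step(model, batch)   # noise is injected only while the embedding module is in training mode
deactivate_neftune(model, handle)  # removes the hook and the noise attribute
```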
This diff is collapsed.
import argparse
import os
import torch
from colossalai.logging import get_dist_logger
from transformers import AutoTokenizer, AutoModelForCausalLM
logger = get_dist_logger()
def load_model(model_path, device="cuda", **kwargs):
logger.info(
"Please check whether the tokenizer and model weights are properly stored in the same folder."
)
model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
model.to(device)
try:
tokenizer = AutoTokenizer.from_pretrained(model_path)
except OSError:
raise ImportError("Tokenizer not found. Please check if the tokenizer exists or the model path is correct.")
return model, tokenizer
@torch.inference_mode()
def generate(args):
model, tokenizer = load_model(model_path=args.model_path, device=args.device)
BASE_INFERENCE_SUFFIX = "\n\n->\n\n"
input_txt = f"{args.input_txt}{BASE_INFERENCE_SUFFIX}"
inputs = tokenizer(input_txt, return_tensors='pt').to(args.device)  # tokenize the full prompt, including the suffix
output = model.generate(**inputs,
max_new_tokens=args.max_new_tokens,
do_sample=args.do_sample,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
num_return_sequences=1)
response = tokenizer.decode(output.cpu()[0], skip_special_tokens=True)[len(input_txt):]
logger.info(f"Question: {input_txt} \n\n Answer: \n{response}")
return response
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Colossal-LLaMA-2 inference Process.")
parser.add_argument('--model_path', type=str, default="hpcai-tech/Colossal-LLaMA-2-7b-base", help="HF repo name or local path of the model")
parser.add_argument('--device', type=str, default="cuda:0", help="Set the device")
parser.add_argument('--max_new_tokens', type=int, default=512, help="Set the maximum number of tokens to generate, ignoring the number of tokens in the prompt")
parser.add_argument('--do_sample', type=bool, default=True, help="Set whether or not to use sampling")
parser.add_argument('--temperature', type=float, default=0.3, help="Set temperature value")
parser.add_argument('--top_k', type=int, default=50, help="Set top_k value for top-k-filtering")
parser.add_argument('--top_p', type=float, default=0.95, help="Set top_p value for generation")
parser.add_argument('--input_txt', type=str, default="明月松间照,", help="The prompt input to the model")
args = parser.parse_args()
generate(args)
\ No newline at end of file
@@ -11,14 +11,14 @@ import os
import time
from multiprocessing import cpu_count
from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
ClosedToConstantLengthSplicedDataset,
supervised_tokenize_pretrain,
)
from datasets import dataset_dict, load_dataset
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from colossalai.logging import get_dist_logger
from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
supervised_tokenize,
ClosedToConstantLengthSplicedDataset,
)
logger = get_dist_logger()
@@ -104,7 +104,7 @@ def main():
assert isinstance(dataset, dataset_dict.Dataset)
logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
dataset = dataset.map(
function=supervised_tokenize,
function=supervised_tokenize_pretrain,
fn_kwargs={"tokenizer": tokenizer, "max_length": args.max_length},
keep_in_memory=False,
num_proc=min(len(dataset), cpu_count()),
@@ -149,5 +149,5 @@ def main():
spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
if __name__ == '__main__':
if __name__ == "__main__":
main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Prepare sft dataset for fine-tuning
"""
import argparse
import json
import math
import os
from multiprocessing import cpu_count
from colossal_llama2.dataset.conversation import default_conversation
from colossal_llama2.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
from datasets import dataset_dict, load_dataset
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from colossalai.logging import get_dist_logger
logger = get_dist_logger()
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_input_dirs",
type=str,
required=True,
default=None,
help="Comma(i.e., ',') separated list of all data directories containing `.jsonl` data files.",
)
parser.add_argument(
"--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
)
parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
parser.add_argument(
"--data_jsonl_output_dir",
type=str,
default="jsonl_output",
help="Output directory of spliced dataset with jsonl format",
)
parser.add_argument(
"--data_arrow_output_dir",
type=str,
default="arrow_output",
help="Output directory of spliced dataset with arrow format",
)
parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
args = parser.parse_args()
if args.num_spliced_dataset_bins >= 100000:
raise ValueError("Too many spliced divisions, must be smaller than 100000")
assert not os.path.exists(args.data_cache_dir), f"Found existing data cache dir {args.data_cache_dir}"
assert not os.path.exists(
args.data_jsonl_output_dir
), f"Found existing jsonl data output dir {args.data_jsonl_output_dir}"
assert not os.path.exists(
args.data_arrow_output_dir
), f"Found existing arrow data output dir {args.data_arrow_output_dir}"
os.makedirs(args.data_jsonl_output_dir)
os.makedirs(args.data_arrow_output_dir)
# Prepare all input datasets
input_data_paths = []
input_data_dirs = args.data_input_dirs.split(",")
for ds_dir in input_data_dirs:
ds_dir = os.path.abspath(ds_dir)
assert os.path.exists(ds_dir), f"Data dir {ds_dir} not found"
ds_files = [name for name in os.listdir(ds_dir) if name.endswith(".jsonl")]
ds_paths = [os.path.join(ds_dir, name) for name in ds_files]
input_data_paths.extend(ds_paths)
# Prepare data splitting.
train_splits = []
split_interval = math.ceil(100 / args.num_spliced_dataset_bins)
for i in range(0, 100, split_interval):
start = i
end = i + split_interval
if end > 100:
end = 100
train_splits.append(f"train[{start}%:{end}%]")
# Prepare the tokenizer.
tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.unk_token
list_dataset = load_dataset(
path="json",
data_files=input_data_paths,
cache_dir=os.path.join(args.data_cache_dir, "raw"),
keep_in_memory=False,
split=train_splits,
num_proc=cpu_count(),
)
for index, dataset in enumerate(list_dataset):
assert isinstance(dataset, dataset_dict.Dataset)
logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
dataset = dataset.map(
function=supervised_tokenize_sft,
fn_kwargs={
"tokenizer": tokenizer,
"conversation_template": default_conversation,
"max_length": args.max_length,
},
keep_in_memory=False,
num_proc=min(len(dataset), cpu_count()),
)
dataset = dataset.filter(lambda data: data["labels"] is not None)
dataset = dataset.sort(column_names=("seq_category", "seq_length"), reverse=False, keep_in_memory=False)
# We don't concatenate data samples here.
spliced_dataset = dataset
# Save each jsonl spliced dataset.
output_index = "0" * (5 - len(str(index))) + str(index)
output_name = f"part-{output_index}"
output_jsonl_path = os.path.join(args.data_jsonl_output_dir, output_name + ".jsonl")
# st = time.time()
with open(file=output_jsonl_path, mode="w", encoding="utf-8") as fp_writer:
spliced_count = 0
for spliced_data_point in spliced_dataset:
if spliced_count % 500 == 0:
logger.info(f"processing {spliced_count} spliced data points for {fp_writer.name}")
spliced_count += 1
fp_writer.write(json.dumps(spliced_data_point, ensure_ascii=False) + "\n")
# Save each arrow spliced dataset
output_arrow_path = os.path.join(args.data_arrow_output_dir, output_name)
logger.info(f"Start to save {output_arrow_path}")
spliced_dataset = load_dataset(
path="json",
data_files=[output_jsonl_path],
cache_dir=os.path.join(args.data_cache_dir, "spliced_and_tokenized"),
keep_in_memory=False,
num_proc=cpu_count(),
split="train",
)
spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
if __name__ == "__main__":
main()
#!/bin/bash
# NCCL IB environment variables
export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_GID_INDEX=3
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
export OMP_NUM_THREADS=8
PROJECT_NAME=""
PARENT_SAVE_DIR=""
PARENT_TENSORBOARD_DIR=""
PARENT_CONFIG_FILE=""
PRETRAINED_MODEL_PATH=""
declare -a dataset=(
"PATH TO THE DATASET"
)
TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
TENSORBOARD_DIR="${PARENT_TENSORBOARD_DIR}${FULL_PROJECT_NAME}"
CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30013 train_sft.py \
--pretrained $PRETRAINED_MODEL_PATH \
--dataset ${dataset[@]} \
--plugin "zero2" \
--save_interval 400 \
--save_dir $SAVE_DIR \
--tensorboard_dir $TENSORBOARD_DIR \
--config_file $CONFIG_FILE \
--num_epochs 1 \
--accumulation_steps 8 \
--micro_batch_size 8 \
--lr 5e-5 \
--mixed_precision "bf16" \
--grad_clip 1.0 \
--weight_decay 0.01 \
--warmup_steps 100 \
--use_grad_checkpoint \
--use_flash_attn \
--use_neft