Commit 9e768b59 authored by zhuwenwen's avatar zhuwenwen
Browse files
parents 7bc5a8e3 8aed02b9
...@@ -4,11 +4,10 @@ on: ...@@ -4,11 +4,10 @@ on:
pull_request: pull_request:
types: [synchronize, opened, reopened] types: [synchronize, opened, reopened]
paths: paths:
- 'applications/Chat/coati/**' - "applications/Chat/coati/**"
- 'applications/Chat/requirements.txt' - "applications/Chat/requirements.txt"
- 'applications/Chat/setup.py' - "applications/Chat/setup.py"
- 'applications/Chat/examples/**' - "applications/Chat/examples/**"
jobs: jobs:
tests: tests:
...@@ -20,7 +19,7 @@ jobs: ...@@ -20,7 +19,7 @@ jobs:
runs-on: [self-hosted, gpu] runs-on: [self-hosted, gpu]
container: container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30 timeout-minutes: 30
defaults: defaults:
run: run:
...@@ -29,28 +28,26 @@ jobs: ...@@ -29,28 +28,26 @@ jobs:
- name: Checkout ColossalAI - name: Checkout ColossalAI
uses: actions/checkout@v2 uses: actions/checkout@v2
- name: Install ColossalAI and ChatGPT - name: Install ChatGPT
run: | run: |
pip install -e .
cd applications/Chat cd applications/Chat
pip install -v . pip install -v .
pip install -r examples/requirements.txt pip install -r examples/requirements.txt
- name: Install Transformers - name: Install Transformers
run: | run: |
cd applications/Chat pip install transformers==4.30.2
git clone https://github.com/hpcaitech/transformers
cd transformers
pip install -v .
- name: Execute Examples - name: Execute Examples
run: | run: |
cd applications/Chat cd applications/Chat
rm -rf ~/.cache/colossalai rm -rf ~/.cache/colossalai
./examples/test_ci.sh ./tests/test_inference.sh
./tests/test_benchmarks.sh
./tests/test_train.sh
env: env:
NCCL_SHM_DISABLE: 1 NCCL_SHM_DISABLE: 1
MAX_JOBS: 8 MAX_JOBS: 8
SFT_DATASET: /data/scratch/github_actions/chat/data.json SFT_DATASET: /data/scratch/github_actions/chat/data.json
PROMPT_PATH: /data/scratch/github_actions/chat/prompts_en.jsonl PROMPT_DATASET: /data/scratch/github_actions/chat/prompts_en.jsonl
PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json
...@@ -30,9 +30,8 @@ jobs: ...@@ -30,9 +30,8 @@ jobs:
- name: Checkout ColossalAI - name: Checkout ColossalAI
uses: actions/checkout@v2 uses: actions/checkout@v2
- name: Install ColossalAI and ChatGPT - name: Install ChatGPT
run: | run: |
pip install -e .
cd applications/Chat cd applications/Chat
pip install -v . pip install -v .
pip install -r requirements-test.txt pip install -r requirements-test.txt
......
...@@ -22,13 +22,13 @@ def compare_dirs(dir1, dir2): ...@@ -22,13 +22,13 @@ def compare_dirs(dir1, dir2):
# If the corresponding item doesn't exist in the second directory, the directories are different # If the corresponding item doesn't exist in the second directory, the directories are different
if not os.path.exists(item_path2): if not os.path.exists(item_path2):
print(f'Found mismatch: {item_path1}, {item_path2}') print(f"Found mismatch: {item_path1}, {item_path2}")
return False return False
# If the corresponding item is a directory, we compare the two directories recursively # If the corresponding item is a directory, we compare the two directories recursively
if os.path.isdir(item_path1) and os.path.isdir(item_path2): if os.path.isdir(item_path1) and os.path.isdir(item_path2):
if not compare_dirs(item_path1, item_path2): if not compare_dirs(item_path1, item_path2):
print(f'Found mismatch: {item_path1}, {item_path2}') print(f"Found mismatch: {item_path1}, {item_path2}")
return False return False
# both are files # both are files
...@@ -37,16 +37,16 @@ def compare_dirs(dir1, dir2): ...@@ -37,16 +37,16 @@ def compare_dirs(dir1, dir2):
# If the corresponding item is not a file or a directory, the directories are different # If the corresponding item is not a file or a directory, the directories are different
else: else:
print(f'Found mismatch: {item_path1}, {item_path2}') print(f"Found mismatch: {item_path1}, {item_path2}")
return False return False
# If all items are the same, the directories are the same # If all items are the same, the directories are the same
return True return True
if __name__ == '__main__': if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-d', '--directory', help="The directory where the multi-language source files are kept.") parser.add_argument("-d", "--directory", help="The directory where the multi-language source files are kept.")
args = parser.parse_args() args = parser.parse_args()
i18n_folders = os.listdir(args.directory) i18n_folders = os.listdir(args.directory)
...@@ -56,7 +56,7 @@ if __name__ == '__main__': ...@@ -56,7 +56,7 @@ if __name__ == '__main__':
for i in range(1, len(i18n_folders)): for i in range(1, len(i18n_folders)):
dir1 = i18n_folders[0] dir1 = i18n_folders[0]
dir2 = i18n_folders[i] dir2 = i18n_folders[i]
print(f'comparing {dir1} vs {dir2}') print(f"comparing {dir1} vs {dir2}")
match = compare_dirs(i18n_folders[0], i18n_folders[i]) match = compare_dirs(i18n_folders[0], i18n_folders[i])
if not match: if not match:
......
...@@ -4,7 +4,7 @@ import os ...@@ -4,7 +4,7 @@ import os
def check_inputs(input_list): def check_inputs(input_list):
for path in input_list: for path in input_list:
real_path = os.path.join('examples', path) real_path = os.path.join("examples", path)
if not os.path.exists(real_path): if not os.path.exists(real_path):
return False return False
return True return True
...@@ -12,16 +12,16 @@ def check_inputs(input_list): ...@@ -12,16 +12,16 @@ def check_inputs(input_list):
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fileNameList', type=str, help="List of file names") parser.add_argument("-f", "--fileNameList", type=str, help="List of file names")
args = parser.parse_args() args = parser.parse_args()
name_list = args.fileNameList.split(",") name_list = args.fileNameList.split(",")
is_correct = check_inputs(name_list) is_correct = check_inputs(name_list)
if is_correct: if is_correct:
print('success') print("success")
else: else:
print('failure') print("failure")
if __name__ == '__main__': if __name__ == "__main__":
main() main()
...@@ -17,21 +17,21 @@ def show_files(path, all_files): ...@@ -17,21 +17,21 @@ def show_files(path, all_files):
def join(input_list, sep=None): def join(input_list, sep=None):
return (sep or ' ').join(input_list) return (sep or " ").join(input_list)
def main(): def main():
contents = show_files('examples/', []) contents = show_files("examples/", [])
all_loc = [] all_loc = []
for file_loc in contents: for file_loc in contents:
split_loc = file_loc.split('/') split_loc = file_loc.split("/")
# must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not. # must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not.
if len(split_loc) >= 4: if len(split_loc) >= 4:
re_loc = '/'.join(split_loc[1:3]) re_loc = "/".join(split_loc[1:3])
if re_loc not in all_loc: if re_loc not in all_loc:
all_loc.append(re_loc) all_loc.append(re_loc)
print(all_loc) print(all_loc)
if __name__ == '__main__': if __name__ == "__main__":
main() main()
...@@ -3,7 +3,7 @@ import argparse ...@@ -3,7 +3,7 @@ import argparse
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files") parser.add_argument("-f", "--fileNameList", type=str, help="The list of changed files")
args = parser.parse_args() args = parser.parse_args()
name_list = args.fileNameList.split(":") name_list = args.fileNameList.split(":")
folder_need_check = set() folder_need_check = set()
...@@ -15,10 +15,10 @@ def main(): ...@@ -15,10 +15,10 @@ def main():
# - application # - application
# - file # - file
if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4: if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
folder_need_check.add('/'.join(loc.split("/")[1:3])) folder_need_check.add("/".join(loc.split("/")[1:3]))
# Output the result using print. Then the shell can get the values. # Output the result using print. Then the shell can get the values.
print(list(folder_need_check)) print(list(folder_need_check))
if __name__ == '__main__': if __name__ == "__main__":
main() main()
...@@ -7,27 +7,27 @@ import re ...@@ -7,27 +7,27 @@ import re
import requests import requests
COMMIT_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/commits' COMMIT_API = "https://api.github.com/repos/hpcaitech/ColossalAI/commits"
TAGS_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/tags' TAGS_API = "https://api.github.com/repos/hpcaitech/ColossalAI/tags"
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='output path for the release draft', required=True) parser.add_argument("--out", type=str, help="output path for the release draft", required=True)
parser.add_argument('--version', type=str, help='current version to release', required=True) parser.add_argument("--version", type=str, help="current version to release", required=True)
return parser.parse_args() return parser.parse_args()
def get_latest_tag_commit(headers=None): def get_latest_tag_commit(headers=None):
res = requests.get(url=TAGS_API, headers=headers) res = requests.get(url=TAGS_API, headers=headers)
data = res.json() data = res.json()
commit_hash = data[0]['commit']['sha'] commit_hash = data[0]["commit"]["sha"]
version = data[0]['name'] version = data[0]["name"]
return commit_hash, version return commit_hash, version
def get_commit_info(commit_hash, headers=None): def get_commit_info(commit_hash, headers=None):
api = f'{COMMIT_API}/{commit_hash}' api = f"{COMMIT_API}/{commit_hash}"
res = requests.get(url=api, headers=headers) res = requests.get(url=api, headers=headers)
return res.json() return res.json()
...@@ -37,7 +37,7 @@ def get_all_commit_info(since, headers=None): ...@@ -37,7 +37,7 @@ def get_all_commit_info(since, headers=None):
results = [] results = []
while True: while True:
api = f'{COMMIT_API}?since={since}&per_page=100&page={page}' api = f"{COMMIT_API}?since={since}&per_page=100&page={page}"
resp = requests.get(url=api, headers=headers) resp = requests.get(url=api, headers=headers)
data = resp.json() data = resp.json()
...@@ -53,21 +53,21 @@ def get_all_commit_info(since, headers=None): ...@@ -53,21 +53,21 @@ def get_all_commit_info(since, headers=None):
def collate_release_info(commit_info_list): def collate_release_info(commit_info_list):
results = dict() results = dict()
pattern = pattern = r'\[.*\]' pattern = pattern = r"\[.*\]"
for commit_info in commit_info_list: for commit_info in commit_info_list:
author = commit_info['commit']['author']['name'] author = commit_info["commit"]["author"]["name"]
try: try:
author_url = commit_info['author']['url'] author_url = commit_info["author"]["url"]
except: except:
# author can be None # author can be None
author_url = None author_url = None
msg = commit_info['commit']['message'] msg = commit_info["commit"]["message"]
match = re.search(pattern, msg) match = re.search(pattern, msg)
if match: if match:
tag = match.group().lstrip('[').rstrip(']').capitalize() tag = match.group().lstrip("[").rstrip("]").capitalize()
if tag not in results: if tag not in results:
results[tag] = [] results[tag] = []
results[tag].append((msg, author, author_url)) results[tag].append((msg, author, author_url))
...@@ -89,42 +89,43 @@ def generate_release_post_markdown(current_version, last_version, release_info): ...@@ -89,42 +89,43 @@ def generate_release_post_markdown(current_version, last_version, release_info):
for msg, author, author_url in v: for msg, author, author_url in v:
# only keep the first line # only keep the first line
msg = msg.split('\n')[0] msg = msg.split("\n")[0]
if author_url: if author_url:
item = f'{msg} by [{author}]({author_url})\n' item = f"{msg} by [{author}]({author_url})\n"
else: else:
item = f'{msg} by {author}\n' item = f"{msg} by {author}\n"
text.append(f'- {item}') text.append(f"- {item}")
text.append('\n') text.append("\n")
# add full change log # add full change log
text.append( text.append(
f'**Full Changelog**: https://github.com/hpcaitech/ColossalAI/compare/{current_version}...{last_version}') f"**Full Changelog**: https://github.com/hpcaitech/ColossalAI/compare/{current_version}...{last_version}"
)
return text return text
if __name__ == '__main__': if __name__ == "__main__":
args = parse_args() args = parse_args()
token = os.environ['GITHUB_API_TOKEN'] token = os.environ["GITHUB_API_TOKEN"]
headers = {'Authorization': token} headers = {"Authorization": token}
# get previous release tag # get previous release tag
last_release_commit, last_version = get_latest_tag_commit(headers) last_release_commit, last_version = get_latest_tag_commit(headers)
last_release_commit_info = get_commit_info(last_release_commit, headers=headers) last_release_commit_info = get_commit_info(last_release_commit, headers=headers)
last_release_date = last_release_commit_info['commit']['author']['date'] last_release_date = last_release_commit_info["commit"]["author"]["date"]
# get the commits since last release # get the commits since last release
commit_info = get_all_commit_info(since=last_release_date, headers=headers) commit_info = get_all_commit_info(since=last_release_date, headers=headers)
commit_info = commit_info[:-1] # remove the release commit commit_info = commit_info[:-1] # remove the release commit
# collate into markdown # collate into markdown
release_info = collate_release_info(commit_info) release_info = collate_release_info(commit_info)
markdown_text = generate_release_post_markdown(args.version, last_version, release_info) markdown_text = generate_release_post_markdown(args.version, last_version, release_info)
# write into a file # write into a file
with open(args.out, 'w') as f: with open(args.out, "w") as f:
for line in markdown_text: for line in markdown_text:
f.write(line) f.write(line)
...@@ -5,8 +5,8 @@ import requests ...@@ -5,8 +5,8 @@ import requests
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-m', '--message', type=str) parser.add_argument("-m", "--message", type=str)
parser.add_argument('-u', '--url', type=str) parser.add_argument("-u", "--url", type=str)
return parser.parse_args() return parser.parse_args()
...@@ -15,6 +15,6 @@ def send_message_to_lark(message, webhook_url): ...@@ -15,6 +15,6 @@ def send_message_to_lark(message, webhook_url):
requests.post(webhook_url, json=data) requests.post(webhook_url, json=data)
if __name__ == '__main__': if __name__ == "__main__":
args = parse_args() args = parse_args()
send_message_to_lark(args.message, args.url) send_message_to_lark(args.message, args.url)
...@@ -155,3 +155,7 @@ colossalai/version.py ...@@ -155,3 +155,7 @@ colossalai/version.py
# ignore coverage test file # ignore coverage test file
coverage.lcov coverage.lcov
coverage.xml coverage.xml
# ignore testmon and coverage files
.coverage
.testmondata*
...@@ -3,3 +3,5 @@ line_length = 120 ...@@ -3,3 +3,5 @@ line_length = 120
multi_line_output=3 multi_line_output=3
include_trailing_comma = true include_trailing_comma = true
ignore_comments = true ignore_comments = true
profile = black
honor_noqa = true
repos: repos:
- repo: https://github.com/PyCQA/autoflake
rev: v2.2.1
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']
- repo: https://github.com/pycqa/isort - repo: https://github.com/pycqa/isort
rev: 5.12.0 rev: 5.12.0
hooks: hooks:
- id: isort - id: isort
name: sort all imports (python) name: sort all imports (python)
- repo: https://github.com/pre-commit/mirrors-yapf - repo: https://github.com/psf/black-pre-commit-mirror
rev: v0.32.0 rev: 23.9.1
hooks: hooks:
- id: yapf - id: black
name: yapf formatter name: black formatter
args: ['--style=.style.yapf', '--parallel', '--in-place'] args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v13.0.1 rev: v13.0.1
hooks: hooks:
- id: clang-format - id: clang-format
name: clang formatter name: clang formatter
types_or: [c++, c]
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0 rev: v4.3.0
......
[style]
based_on_style = google
spaces_before_comment = 4
split_before_logical_operator = true
column_limit = 120
...@@ -30,6 +30,12 @@ pip install <options> -e . ...@@ -30,6 +30,12 @@ pip install <options> -e .
### Unit Tests ### Unit Tests
We use [PyTest](https://docs.pytest.org/en/latest/) to execute tests. You can install pytest by `pip install pytest`. As some of the tests require initialization of the distributed backend, GPUs are needed to execute these tests. We use [PyTest](https://docs.pytest.org/en/latest/) to execute tests. You can install pytest by `pip install pytest`. As some of the tests require initialization of the distributed backend, GPUs are needed to execute these tests.
To set up the environment for unit testing, first change your current directory to the root directory of your local ColossalAI repository, then run
```bash
pip install -r requirements/requirements-test.txt
```
If you encounter an error telling "Could not find a version that satisfies the requirement fbgemm-gpu==0.2.0", please downgrade your python version to 3.8 or 3.9 and try again.
If you only want to run CPU tests, you can run If you only want to run CPU tests, you can run
```bash ```bash
...@@ -138,4 +144,4 @@ You can now create a pull request on the GitHub webpage of your repository. The ...@@ -138,4 +144,4 @@ You can now create a pull request on the GitHub webpage of your repository. The
Do write clearly the description of your pull request and [link the pull request to your target issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). This will automatically close the issue when the pull request is approved. Do write clearly the description of your pull request and [link the pull request to your target issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). This will automatically close the issue when the pull request is approved.
In case of code conflict, you should rebase your branch and resolve the conflicts manually. In case of code conflict, you should rebase your branch and resolve the conflicts manually.
\ No newline at end of file
...@@ -396,3 +396,84 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved. ...@@ -396,3 +396,84 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE. POSSIBILITY OF SUCH DAMAGE.
---------------- LICENSE FOR VLLM TEAM ----------------
from VLLM TEAM:
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/vllm-project/vllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---------------- LICENSE FOR LIGHTLLM TEAM ----------------
from LIGHTLLM TEAM:
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/ModelTC/lightllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---------------- LICENSE FOR AutoGPTQ ----------------
From AutoGPTQ:
MIT License
Copyright (c) 2023 潘其威(William)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---------------- LICENSE FOR exllama ----------------
From exllama:
MIT License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
[![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest) [![Documentation](https://readthedocs.org/projects/colossalai/badge/?version=latest)](https://colossalai.readthedocs.io/en/latest/?badge=latest)
[![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai) [![CodeFactor](https://www.codefactor.io/repository/github/hpcaitech/colossalai/badge)](https://www.codefactor.io/repository/github/hpcaitech/colossalai)
[![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech) [![HuggingFace badge](https://img.shields.io/badge/%F0%9F%A4%97HuggingFace-Join-yellow)](https://huggingface.co/hpcai-tech)
[![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://github.com/hpcaitech/public_assets/tree/main/colossalai/contact/slack)
[![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&amp)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png) [![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&amp)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png)
...@@ -25,14 +25,15 @@ ...@@ -25,14 +25,15 @@
</div> </div>
## Latest News ## Latest News
* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) * [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) * [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt) * [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02) * [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
## Table of Contents ## Table of Contents
<ul> <ul>
...@@ -41,6 +42,7 @@ ...@@ -41,6 +42,7 @@
<li> <li>
<a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a> <a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
<ul> <ul>
<li><a href="#Colossal-LLaMA-2">Colossal-LLaMA-2: One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution</a></li>
<li><a href="#ColossalChat">ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline</a></li> <li><a href="#ColossalChat">ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline</a></li>
<li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li> <li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
<li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li> <li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li>
...@@ -49,6 +51,7 @@ ...@@ -49,6 +51,7 @@
<li> <li>
<a href="#Parallel-Training-Demo">Parallel Training Demo</a> <a href="#Parallel-Training-Demo">Parallel Training Demo</a>
<ul> <ul>
<li><a href="#LLaMA2">LLaMA 1/2</a></li>
<li><a href="#GPT-3">GPT-3</a></li> <li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li> <li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li> <li><a href="#BERT">BERT</a></li>
...@@ -124,15 +127,55 @@ distributed training and inference in a few lines. ...@@ -124,15 +127,55 @@ distributed training and inference in a few lines.
## Colossal-AI in the Real World ## Colossal-AI in the Real World
### Colossal-LLaMA-2
- One half-day of training using a few hundred dollars yields similar results to mainstream large models, open-source and commercial-free domain-specific LLM solution.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
[[blog]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
[[model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
| | Backbone | Tokens Consumed | | MMLU | CMMLU | AGIEval | GAOKAO | CEval |
| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
| | | - | | 5-shot | 5-shot | 5-shot | 0-shot | 5-shot |
| Baichuan-7B | - | 1.2T | | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| | | | | | | | | |
| Llama-2-7B | - | 2.0T | | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | | 50.25 | 40.99 | 40.04 | 30.54 | - |
| | | | | | | | | |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
### ColossalChat ### ColossalChat
<div align="center"> <div align="center">
<a href="https://chat.colossalai.org/"> <a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" /> <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a> </a>
</div> </div>
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) [[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[demo]](https://chat.colossalai.org) [ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
<p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>
- Up to 10 times faster for RLHF PPO Stage3 Training
<p id="ColossalChat_scaling" align="center"> <p id="ColossalChat_scaling" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/> <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
...@@ -205,6 +248,23 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/) ...@@ -205,6 +248,23 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
<p align="right">(<a href="#top">back to top</a>)</p> <p align="right">(<a href="#top">back to top</a>)</p>
## Parallel Training Demo ## Parallel Training Demo
### LLaMA2
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/llama2_pretraining.png" width=600/>
</p>
- 70 billion parameter LLaMA2 model training accelerated by 195%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2)
[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
### LLaMA1
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA_pretraining.png" width=600/>
</p>
- 65-billion-parameter large model pretraining accelerated by 38%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)
### GPT-3 ### GPT-3
<p align="center"> <p align="center">
...@@ -352,6 +412,22 @@ If you want to install and enable CUDA kernel fusion (compulsory installation wh ...@@ -352,6 +412,22 @@ If you want to install and enable CUDA kernel fusion (compulsory installation wh
CUDA_EXT=1 pip install . CUDA_EXT=1 pip install .
``` ```
For users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
```bash
# clone the repository
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# download the cub library
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
# install
CUDA_EXT=1 pip install .
```
<p align="right">(<a href="#top">back to top</a>)</p> <p align="right">(<a href="#top">back to top</a>)</p>
## Use Docker ## Use Docker
...@@ -426,6 +502,7 @@ To cite this project, you can use the following BibTeX citation. ...@@ -426,6 +502,7 @@ To cite this project, you can use the following BibTeX citation.
} }
``` ```
Colossal-AI has been accepted as official tutorial by top conferences [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc. Colossal-AI has been accepted as official tutorial by top conferences [NeurIPS](https://nips.cc/), [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/),
[PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), [NVIDIA GTC](https://www.nvidia.com/en-us/on-demand/session/gtcspring23-S51482/), etc.
<p align="right">(<a href="#top">back to top</a>)</p> <p align="right">(<a href="#top">back to top</a>)</p>
...@@ -145,4 +145,4 @@ docs/.build ...@@ -145,4 +145,4 @@ docs/.build
# wandb log # wandb log
example/wandb/ example/wandb/
examples/awesome-chatgpt-prompts/ examples/awesome-chatgpt-prompts/
\ No newline at end of file
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
<span>ColossalChat</span> <span>ColossalChat</span>
</h1> </h1>
## Table of Contents ## Table of Contents
- [Table of Contents](#table-of-contents) - [Table of Contents](#table-of-contents)
...@@ -34,7 +33,9 @@ ...@@ -34,7 +33,9 @@
- [Authors](#authors) - [Authors](#authors)
- [Citations](#citations) - [Citations](#citations)
- [Licenses](#licenses) - [Licenses](#licenses)
--- ---
## What is ColossalChat and Coati ? ## What is ColossalChat and Coati ?
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) is the project to implement LLM with RLHF, powered by the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) project. [ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) is the project to implement LLM with RLHF, powered by the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) project.
...@@ -42,6 +43,7 @@ ...@@ -42,6 +43,7 @@
Coati stands for `ColossalAI Talking Intelligence`. It is the name for the module implemented in this project and is also the name of the large language model developed by the ColossalChat project. Coati stands for `ColossalAI Talking Intelligence`. It is the name for the module implemented in this project and is also the name of the large language model developed by the ColossalChat project.
The Coati package provides a unified large language model framework that has implemented the following functions The Coati package provides a unified large language model framework that has implemented the following functions
- Supports comprehensive large-model training acceleration capabilities for ColossalAI, without requiring knowledge of complex distributed training algorithms - Supports comprehensive large-model training acceleration capabilities for ColossalAI, without requiring knowledge of complex distributed training algorithms
- Supervised datasets collection - Supervised datasets collection
- Supervised instructions fine-tuning - Supervised instructions fine-tuning
...@@ -56,29 +58,42 @@ The Coati package provides a unified large language model framework that has imp ...@@ -56,29 +58,42 @@ The Coati package provides a unified large language model framework that has imp
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/chatgpt.png" width=700/> <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/chatgpt.png" width=700/>
</p> </p>
Image source: https://openai.com/blog/chatgpt Image source: https://openai.com/blog/chatgpt
</div> </div>
**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.** **As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
More details can be found in the latest news. More details can be found in the latest news.
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt) - [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
- [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
## Online demo ## Online demo
You can experience the performance of Coati7B on this page.
[chat.colossalai.org](https://chat.colossalai.org/) <div align="center">
<a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a>
</div>
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
Due to resource constraints, we will only provide this service from 29th Mar 2023 to 5 April 2023. However, we have provided the inference code in the [inference](./inference/) folder. The WebUI will be open-sourced soon as well. <p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>
> DeepSpeedChat performance figures come from its blog published on April 12, 2023; ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs using the following command: `torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --num_collect_steps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32`
> Warning: Due to model and dataset size limitations, Coati is just a baby model; Coati7B may output incorrect information and lacks the ability for multi-turn dialogue. There is still significant room for improvement.
## Install ## Install
### Install the environment ### Install the environment
```shell ```bash
conda create -n coati conda create -n coati
conda activate coati conda activate coati
git clone https://github.com/hpcaitech/ColossalAI.git git clone https://github.com/hpcaitech/ColossalAI.git
...@@ -87,22 +102,20 @@ pip install . ...@@ -87,22 +102,20 @@ pip install .
``` ```
### Install the Transformers ### Install the Transformers
Given Hugging Face hasn't officially supported the LLaMA models, We fork a branch of Transformers that can be compatible with our code
```shell ```bash
git clone https://github.com/hpcaitech/transformers pip install transformers==4.30.2
cd transformers
pip install .
``` ```
## How to use? ## How to use?
### Supervised datasets collection ### Supervised datasets collection
we collected 104K bilingual datasets of Chinese and English, and you can find the datasets in this repo We collected 104K bilingual datasets of Chinese and English, and you can find the datasets in this repo
[InstructionWild](https://github.com/XueFuzhao/InstructionWild) [InstructionWild](https://github.com/XueFuzhao/InstructionWild) and in this [file](https://github.com/XueFuzhao/InstructionWild/blob/main/data/README.md).
Here is how we collected the data Here is how we collected the data
<p align="center"> <p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/> <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
</p> </p>
...@@ -112,12 +125,28 @@ Here is how we collected the data ...@@ -112,12 +125,28 @@ Here is how we collected the data
Stage1 is supervised instructs fine-tuning, which uses the datasets mentioned earlier to fine-tune the model. Stage1 is supervised instructs fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning. You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning.
[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
**Note**: the supervised dataset must be in the following format:
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
### RLHF Training Stage2 - Training reward model ### RLHF Training Stage2 - Training reward model
Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model
You can run the `examples/train_rm.sh` to start a reward model training. You can run the `examples/train_rm.sh` to start a reward model training.
[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
### RLHF Training Stage3 - Training model with reinforcement learning by human feedback ### RLHF Training Stage3 - Training model with reinforcement learning by human feedback
...@@ -128,6 +157,39 @@ Stage3 uses reinforcement learning algorithm, which is the most complex part of ...@@ -128,6 +157,39 @@ Stage3 uses reinforcement learning algorithm, which is the most complex part of
</p> </p>
You can run the `examples/train_prompts.sh` to start training PPO with human feedback. You can run the `examples/train_prompts.sh` to start training PPO with human feedback.
[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
**Note**: the required datasets must be in the following formats:
- `pretrain dataset`
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
- `prompt dataset`
```json
[
{
"instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
"id": 0
},
{
"instruction": "Write a descriptive paragraph about a memorable vacation you went on",
"id": 1
},
...
]
```
For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples). For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
...@@ -135,9 +197,9 @@ For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree ...@@ -135,9 +197,9 @@ For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree
We provide an online inference server and a benchmark. We aim to run inference on single GPU, so quantization is essential when using large models. We provide an online inference server and a benchmark. We aim to run inference on single GPU, so quantization is essential when using large models.
We support 8-bit quantization (RTN), 4-bit quantization (GPTQ), and FP16 inference. You can We support 8-bit quantization (RTN), 4-bit quantization (GPTQ), and FP16 inference.
Online inference server scripts can help you deploy your own services.
Online inference server scripts can help you deploy your own services.
For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference). For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
## Coati7B examples ## Coati7B examples
...@@ -147,6 +209,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre ...@@ -147,6 +209,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre
<details><summary><b>E-mail</b></summary> <details><summary><b>E-mail</b></summary>
![phd](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/Phd.png) ![phd](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/Phd.png)
</details> </details>
<details><summary><b>coding</b></summary> <details><summary><b>coding</b></summary>
...@@ -180,6 +243,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre ...@@ -180,6 +243,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre
</details> </details>
### Open QA ### Open QA
<details><summary><b>Game</b></summary> <details><summary><b>Game</b></summary>
![Game](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/game.png) ![Game](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/game.png)
...@@ -213,6 +277,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre ...@@ -213,6 +277,7 @@ For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tre
You can find more examples in this [repo](https://github.com/XueFuzhao/InstructionWild/blob/main/comparison.md). You can find more examples in this [repo](https://github.com/XueFuzhao/InstructionWild/blob/main/comparison.md).
### Limitation ### Limitation
<details><summary><b>Limitation for LLaMA-finetuned models</b></summary> <details><summary><b>Limitation for LLaMA-finetuned models</b></summary>
- Both Alpaca and ColossalChat are based on LLaMA. It is hard to compensate for the missing knowledge in the pre-training stage. - Both Alpaca and ColossalChat are based on LLaMA. It is hard to compensate for the missing knowledge in the pre-training stage.
- Lack of counting ability: Cannot count the number of items in a list. - Lack of counting ability: Cannot count the number of items in a list.
...@@ -236,7 +301,7 @@ You can find more examples in this [repo](https://github.com/XueFuzhao/Instructi ...@@ -236,7 +301,7 @@ You can find more examples in this [repo](https://github.com/XueFuzhao/Instructi
We have integrated the Transformers save and load pipeline, allowing users to freely call Hugging Face's language models and save them in the HF format. We have integrated the Transformers save and load pipeline, allowing users to freely call Hugging Face's language models and save them in the HF format.
``` ```python
from coati.models.llama import LlamaLM from coati.models.llama import LlamaLM
from coati.trainer import SFTTrainer from coati.trainer import SFTTrainer
...@@ -245,20 +310,20 @@ tokenizer = AutoTokenizer.from_pretrained(args.pretrain) ...@@ -245,20 +310,20 @@ tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
(model, optim) = strategy.prepare((model, optim)) (model, optim) = strategy.prepare((model, optim))
trainer = SFTTrainer(model=model, trainer = SFTTrainer(model=model,
strategy=strategy, strategy=strategy,
optim=optim, optim=optim,
train_dataloader=train_dataloader, train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader, eval_dataloader=eval_dataloader,
batch_size=args.batch_size, batch_size=args.batch_size,
max_epochs=args.max_epochs, max_epochs=args.max_epochs,
accumulation_steps = args.accumulation_steps accumulation_steps=args.accumulation_steps
) )
trainer.fit() trainer.fit()
# this saves in pytorch format # this saves in pytorch format
strategy.save_model(model, args.save_path, only_rank0=True) strategy.save_model(model, args.save_path, only_rank0=True)
# this saves in HF format. ColossalAI strategy with stage-3 doesn't support this method # this saves in HF format
strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=tokenizer) strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=tokenizer)
``` ```
...@@ -269,12 +334,13 @@ strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=token ...@@ -269,12 +334,13 @@ strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=token
Here are some examples that can allow you to train a 7B model on a single or multiple consumer-grade GPUs. Here are some examples that can allow you to train a 7B model on a single or multiple consumer-grade GPUs.
If you only have a single 24G GPU, you can use the following script. `batch_size`, `lora_rank` and `grad_checkpoint` are the most important parameters to successfully train the model. If you only have a single 24G GPU, you can use the following script. `batch_size`, `lora_rank` and `grad_checkpoint` are the most important parameters to successfully train the model.
```
```bash
// [INFO]: MAX GPU MEMORY ALLOCATED: 19148.9345703125 MB
torchrun --standalone --nproc_per_node=1 train_sft.py \ torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \ --pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \ --model 'llama' \
--strategy naive \ --strategy ddp \
--log_interval 10 \
--save_path /path/to/Coati-7B \ --save_path /path/to/Coati-7B \
--dataset /path/to/data.json \ --dataset /path/to/data.json \
--batch_size 1 \ --batch_size 1 \
...@@ -287,12 +353,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \ ...@@ -287,12 +353,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
``` ```
`colossalai_gemini` strategy can enable a single 24G GPU to train the whole model without using LoRA if you have sufficient CPU memory. You can use the following script. `colossalai_gemini` strategy can enable a single 24G GPU to train the whole model without using LoRA if you have sufficient CPU memory. You can use the following script.
```
```bash
torchrun --standalone --nproc_per_node=1 train_sft.py \ torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \ --pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \ --model 'llama' \
--strategy colossalai_gemini \ --strategy colossalai_gemini \
--log_interval 10 \
--save_path /path/to/Coati-7B \ --save_path /path/to/Coati-7B \
--dataset /path/to/data.json \ --dataset /path/to/data.json \
--batch_size 1 \ --batch_size 1 \
...@@ -304,12 +370,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \ ...@@ -304,12 +370,12 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
``` ```
If you have 4x32 GB GPUs, you can even train the whole 7B model using our `colossalai_zero2_cpu` strategy! The script is given as follows. If you have 4x32 GB GPUs, you can even train the whole 7B model using our `colossalai_zero2_cpu` strategy! The script is given as follows.
```
```bash
torchrun --standalone --nproc_per_node=4 train_sft.py \ torchrun --standalone --nproc_per_node=4 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \ --pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \ --model 'llama' \
--strategy colossalai_zero2_cpu \ --strategy colossalai_zero2_cpu \
--log_interval 10 \
--save_path /path/to/Coati-7B \ --save_path /path/to/Coati-7B \
--dataset /path/to/data.json \ --dataset /path/to/data.json \
--batch_size 1 \ --batch_size 1 \
...@@ -319,8 +385,8 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ ...@@ -319,8 +385,8 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--max_epochs 1 \ --max_epochs 1 \
--grad_checkpoint --grad_checkpoint
``` ```
</details>
</details>
## The Plan ## The Plan
...@@ -335,31 +401,33 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ ...@@ -335,31 +401,33 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
- [ ] support chain-of-thought by [langchain](https://github.com/hwchase17/langchain) - [ ] support chain-of-thought by [langchain](https://github.com/hwchase17/langchain)
### Real-time progress ### Real-time progress
You will find our progress in github project broad
[Coati](https://github.com/orgs/hpcaitech/projects/17/views/1) You will find our progress in the GitHub [project board](https://github.com/orgs/hpcaitech/projects/17/views/1).
## Invitation to open-source contribution ## Invitation to open-source contribution
Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models from the starting point of replicating ChatGPT! Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models from the starting point of replicating ChatGPT!
You may contact us or participate in the following ways: You may contact us or participate in the following ways:
1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks! 1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub follow the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md). 2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub follow the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
3. Join the Colossal-AI community on 3. Join the Colossal-AI community on
[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w), [Slack](https://github.com/hpcaitech/public_assets/tree/main/colossalai/contact/slack),
and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas. and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
4. Send your official proposal to email contact@hpcaitech.com 4. Send your official proposal to email contact@hpcaitech.com
Thanks so much to all of our amazing contributors! Thanks so much to all of our amazing contributors!
## Quick Preview ## Quick Preview
<div align="center"> <div align="center">
<a href="https://chat.colossalai.org/"> <a href="https://chat.colossalai.org/">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" /> <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" />
</a> </a>
</div> </div>
- An open-source low cost solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[demo]](https://chat.colossalai.org) - An open-source low-cost solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[demo]](https://chat.colossalai.org)
<p id="ChatGPT_scaling" align="center"> <p id="ChatGPT_scaling" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/> <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
...@@ -386,18 +454,21 @@ Thanks so much to all of our amazing contributors! ...@@ -386,18 +454,21 @@ Thanks so much to all of our amazing contributors!
| Better Cases | 38 ⚔ **41** | **45** ⚔ 33 | | Better Cases | 38 ⚔ **41** | **45** ⚔ 33 |
| Win Rate | 48% ⚔ **52%** | **58%** ⚔ 42% | | Win Rate | 48% ⚔ **52%** | **58%** ⚔ 42% |
| Average Score | 7.06 ⚔ **7.13** | **7.31** ⚔ 6.82 | | Average Score | 7.06 ⚔ **7.13** | **7.31** ⚔ 6.82 |
- Our Coati-7B model performs better than Alpaca-7B when using GPT-4 to evaluate model performance. The Coati-7B model we evaluate is an old version we trained a few weeks ago and the new version is around the corner. - Our Coati-7B model performs better than Alpaca-7B when using GPT-4 to evaluate model performance. The Coati-7B model we evaluate is an old version we trained a few weeks ago and the new version is around the corner.
## Authors ## Authors
Coati is developed by ColossalAI Team: Coati is developed by ColossalAI Team:
- [Fazzie](https://fazzie-key.cool/about/index.html) - [Fazzie](https://fazzie-key.cool/about/index.html)
- [FrankLeeeee](https://github.com/FrankLeeeee) - [FrankLeeeee](https://github.com/FrankLeeeee)
- [BlueRum](https://github.com/ht-zhou) - [BlueRum](https://github.com/ht-zhou)
- [ver217](https://github.com/ver217) - [ver217](https://github.com/ver217)
- [ofey404](https://github.com/ofey404) - [ofey404](https://github.com/ofey404)
- [Wenhao Chen](https://github.com/CWHer)
The Phd student from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project. The PhD students from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project.
- [Zangwei Zheng](https://github.com/zhengzangw) - [Zangwei Zheng](https://github.com/zhengzangw)
- [Xue Fuzhao](https://github.com/XueFuzhao) - [Xue Fuzhao](https://github.com/XueFuzhao)
......
...@@ -27,9 +27,12 @@ We also provide various training strategies: ...@@ -27,9 +27,12 @@ We also provide various training strategies:
We only support `torchrun` to launch now. E.g. We only support `torchrun` to launch now. E.g.
```shell ```bash
# run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size # run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --critic_model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0 torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py \
--model 125m --critic_model 125m --strategy ddp \
--experience_batch_size 1 --train_batch_size 1 --lora_rank 0
# run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU # run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU
torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4 torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py \
--model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4
``` ```
...@@ -8,7 +8,7 @@ from coati.models.base import RewardModel ...@@ -8,7 +8,7 @@ from coati.models.base import RewardModel
from coati.models.opt import OPTActor, OPTCritic from coati.models.opt import OPTActor, OPTCritic
from coati.trainer import PPOTrainer from coati.trainer import PPOTrainer
from coati.trainer.callbacks import PerformanceEvaluator from coati.trainer.callbacks import PerformanceEvaluator
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
from torch.optim import Adam from torch.optim import Adam
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -19,7 +19,7 @@ from colossalai.nn.optimizer import HybridAdam ...@@ -19,7 +19,7 @@ from colossalai.nn.optimizer import HybridAdam
def get_model_numel(model: nn.Module, strategy: Strategy) -> int: def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
numel = sum(p.numel() for p in model.parameters()) numel = sum(p.numel() for p in model.parameters())
if isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3 and strategy.shard_init: if isinstance(strategy, GeminiStrategy) and strategy.shard_init:
numel *= dist.get_world_size() numel *= dist.get_world_size()
return numel return numel
...@@ -27,7 +27,7 @@ def get_model_numel(model: nn.Module, strategy: Strategy) -> int: ...@@ -27,7 +27,7 @@ def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
def preprocess_batch(samples) -> dict: def preprocess_batch(samples) -> dict:
input_ids = torch.stack(samples) input_ids = torch.stack(samples)
attention_mask = torch.ones_like(input_ids, dtype=torch.long) attention_mask = torch.ones_like(input_ids, dtype=torch.long)
return {'input_ids': input_ids, 'attention_mask': attention_mask} return {"input_ids": input_ids, "attention_mask": attention_mask}
def print_rank_0(*args, **kwargs) -> None: def print_rank_0(*args, **kwargs) -> None:
...@@ -39,32 +39,32 @@ def print_model_numel(model_dict: dict) -> None: ...@@ -39,32 +39,32 @@ def print_model_numel(model_dict: dict) -> None:
B = 1024**3 B = 1024**3
M = 1024**2 M = 1024**2
K = 1024 K = 1024
outputs = '' outputs = ""
for name, numel in model_dict.items(): for name, numel in model_dict.items():
outputs += f'{name}: ' outputs += f"{name}: "
if numel >= B: if numel >= B:
outputs += f'{numel / B:.2f} B\n' outputs += f"{numel / B:.2f} B\n"
elif numel >= M: elif numel >= M:
outputs += f'{numel / M:.2f} M\n' outputs += f"{numel / M:.2f} M\n"
elif numel >= K: elif numel >= K:
outputs += f'{numel / K:.2f} K\n' outputs += f"{numel / K:.2f} K\n"
else: else:
outputs += f'{numel}\n' outputs += f"{numel}\n"
print_rank_0(outputs) print_rank_0(outputs)
def get_gpt_config(model_name: str) -> OPTConfig: def get_gpt_config(model_name: str) -> OPTConfig:
model_map = { model_map = {
'125m': OPTConfig.from_pretrained('facebook/opt-125m'), "125m": OPTConfig.from_pretrained("facebook/opt-125m"),
'350m': OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16), "350m": OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
'700m': OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20), "700m": OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
'1.3b': OPTConfig.from_pretrained('facebook/opt-1.3b'), "1.3b": OPTConfig.from_pretrained("facebook/opt-1.3b"),
'2.7b': OPTConfig.from_pretrained('facebook/opt-2.7b'), "2.7b": OPTConfig.from_pretrained("facebook/opt-2.7b"),
'3.5b': OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32), "3.5b": OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
'5.5b': OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32), "5.5b": OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
'6.7b': OPTConfig.from_pretrained('facebook/opt-6.7b'), "6.7b": OPTConfig.from_pretrained("facebook/opt-6.7b"),
'10b': OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32), "10b": OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
'13b': OPTConfig.from_pretrained('facebook/opt-13b'), "13b": OPTConfig.from_pretrained("facebook/opt-13b"),
} }
try: try:
return model_map[model_name] return model_map[model_name]
...@@ -73,20 +73,20 @@ def get_gpt_config(model_name: str) -> OPTConfig: ...@@ -73,20 +73,20 @@ def get_gpt_config(model_name: str) -> OPTConfig:
def main(args): def main(args):
if args.strategy == 'ddp': if args.strategy == "ddp":
strategy = DDPStrategy() strategy = DDPStrategy()
elif args.strategy == 'colossalai_gemini': elif args.strategy == "colossalai_gemini":
strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5)
elif args.strategy == 'colossalai_gemini_cpu': elif args.strategy == "colossalai_gemini_cpu":
strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
elif args.strategy == 'colossalai_zero2': elif args.strategy == "colossalai_zero2":
strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
elif args.strategy == 'colossalai_zero2_cpu': elif args.strategy == "colossalai_zero2_cpu":
strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
elif args.strategy == 'colossalai_zero1': elif args.strategy == "colossalai_zero1":
strategy = ColossalAIStrategy(stage=1, placement_policy='cuda') strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
elif args.strategy == 'colossalai_zero1_cpu': elif args.strategy == "colossalai_zero1_cpu":
strategy = ColossalAIStrategy(stage=1, placement_policy='cpu') strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
else: else:
raise ValueError(f'Unsupported strategy "{args.strategy}"') raise ValueError(f'Unsupported strategy "{args.strategy}"')
...@@ -103,92 +103,106 @@ def main(args): ...@@ -103,92 +103,106 @@ def main(args):
if args.use_kernels: if args.use_kernels:
from coati.kernels import convert_to_xformer_model from coati.kernels import convert_to_xformer_model
actor, critic, initial_model, reward_model = map(convert_to_xformer_model,
(actor, critic, initial_model, reward_model)) actor, critic, initial_model, reward_model = map(
convert_to_xformer_model, (actor, critic, initial_model, reward_model)
)
actor_numel = get_model_numel(actor, strategy) actor_numel = get_model_numel(actor, strategy)
critic_numel = get_model_numel(critic, strategy) critic_numel = get_model_numel(critic, strategy)
initial_model_numel = get_model_numel(initial_model, strategy) initial_model_numel = get_model_numel(initial_model, strategy)
reward_model_numel = get_model_numel(reward_model, strategy) reward_model_numel = get_model_numel(reward_model, strategy)
print_model_numel({ print_model_numel(
'Actor': actor_numel, {
'Critic': critic_numel, "Actor": actor_numel,
'Initial model': initial_model_numel, "Critic": critic_numel,
'Reward model': reward_model_numel "Initial model": initial_model_numel,
}) "Reward model": reward_model_numel,
performance_evaluator = PerformanceEvaluator(actor_numel, }
critic_numel, )
initial_model_numel, performance_evaluator = PerformanceEvaluator(
reward_model_numel, actor_numel,
enable_grad_checkpoint=False, critic_numel,
ignore_episodes=1) initial_model_numel,
reward_model_numel,
if args.strategy.startswith('colossalai'): enable_grad_checkpoint=False,
ignore_episodes=1,
)
if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=5e-6) actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
critic_optim = HybridAdam(critic.parameters(), lr=5e-6) critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
else: else:
actor_optim = Adam(actor.parameters(), lr=5e-6) actor_optim = Adam(actor.parameters(), lr=5e-6)
critic_optim = Adam(critic.parameters(), lr=5e-6) critic_optim = Adam(critic.parameters(), lr=5e-6)
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m') tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
(actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim)) (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
trainer = PPOTrainer(strategy,
actor,
critic,
reward_model,
initial_model,
actor_optim,
critic_optim,
ptx_coef=0,
max_epochs=args.max_epochs,
train_batch_size=args.train_batch_size,
offload_inference_models=args.offload_inference_models,
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
use_cache=True,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
callbacks=[performance_evaluator])
random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device()) random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
dataloader = DataLoader(random_prompts, dataloader = DataLoader(
batch_size=args.experience_batch_size, random_prompts, batch_size=args.experience_batch_size, shuffle=True, collate_fn=preprocess_batch
shuffle=True, )
collate_fn=preprocess_batch)
trainer = PPOTrainer(
trainer.fit(dataloader, strategy,
None, actor,
num_episodes=args.num_episodes, critic,
max_timesteps=args.max_timesteps, reward_model,
update_timesteps=args.update_timesteps) initial_model,
actor_optim,
print_rank_0(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB') critic_optim,
tokenizer=tokenizer,
ptx_coef=0,
if __name__ == '__main__': train_batch_size=args.train_batch_size,
offload_inference_models=args.offload_inference_models,
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
use_cache=True,
callbacks=[performance_evaluator],
)
trainer.fit(
prompt_dataloader=dataloader,
pretrain_dataloader=None,
num_episodes=args.num_episodes,
num_update_steps=args.num_update_steps,
num_collect_steps=args.num_collect_steps,
)
print_rank_0(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--model', default='125m') parser.add_argument("--model", default="125m")
parser.add_argument('--critic_model', default='125m') parser.add_argument("--critic_model", default="125m")
parser.add_argument('--strategy', parser.add_argument(
choices=[ "--strategy",
'ddp', 'colossalai_gemini', 'colossalai_gemini_cpu', 'colossalai_zero2', choices=[
'colossalai_zero2_cpu', 'colossalai_zero1', 'colossalai_zero1_cpu' "ddp",
], "colossalai_gemini",
default='ddp') "colossalai_gemini_cpu",
parser.add_argument('--num_episodes', type=int, default=3) "colossalai_zero2",
parser.add_argument('--max_timesteps', type=int, default=8) "colossalai_zero2_cpu",
parser.add_argument('--update_timesteps', type=int, default=8) "colossalai_zero1",
parser.add_argument('--max_epochs', type=int, default=1) "colossalai_zero1_cpu",
parser.add_argument('--train_batch_size', type=int, default=8) ],
parser.add_argument('--experience_batch_size', type=int, default=8) default="ddp",
parser.add_argument('--lora_rank', type=int, default=0) )
parser.add_argument('--cuda_mem_frac', type=float, default=1.0) parser.add_argument("--num_episodes", type=int, default=3)
parser.add_argument('--offload_inference_models', action='store_true', default=False) parser.add_argument("--num_collect_steps", type=int, default=8)
parser.add_argument('--use_kernels', action='store_true', default=False) parser.add_argument("--num_update_steps", type=int, default=1)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--experience_batch_size", type=int, default=8)
parser.add_argument("--lora_rank", type=int, default=0)
parser.add_argument("--cuda_mem_frac", type=float, default=1.0)
parser.add_argument("--offload_inference_models", action="store_true", default=False)
parser.add_argument("--use_kernels", action="store_true", default=False)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment