Merge branch 'main' of https://github.com/hpcaitech/ColossalAI

9e768b59 · zhuwenwen · 7bc5a8e3 · 8aed02b9 · 9e768b59 · 9e768b59
Commit 9e768b59 authored Oct 10, 2023 by zhuwenwen
20 changed files
--- a/applications/Chat/inference/server.py
+++ b/applications/Chat/inference/server.py
 import argparse
 import os
 from threading import Lock
-from typing import Dict, Generator, List, Optional
+from typing import Generator, List, Optional

 import torch
 import uvicorn
-from fastapi import FastAPI, HTTPException, Request
+from coati.quant import llama_load_quant, low_resource_init
+from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
-from llama_gptq import load_quant
 from pydantic import BaseModel, Field
 from slowapi import Limiter, _rate_limit_exceeded_handler
 from slowapi.errors import RateLimitExceeded
 from slowapi.util import get_remote_address
 from sse_starlette.sse import EventSourceResponse
-from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM
-from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn, load_json
+from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
+from utils import ChatPromptProcessor, Dialogue, LockedIterator, load_json, sample_streamingly, update_model_kwargs_fn

-CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
+CONTEXT = "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions."
 MAX_LEN = 512
 running_lock = Lock()

@@ -36,11 +36,11 @@ app.state.limiter = limiter
 app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

 # set CORS
-origin_spec_from_env = os.environ.get('CORS_ORIGIN', None)
+origin_spec_from_env = os.environ.get("CORS_ORIGIN", None)

 if origin_spec_from_env is not None:
    # allow CORS from the specified origins
-    origins = os.environ['CORS_ORIGIN'].split(',')
+    origins = os.environ["CORS_ORIGIN"].split(",")
 else:
    # allow CORS from all origins
    origins = ["*"]
@@ -56,15 +56,15 @@ app.add_middleware(

 def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
    inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
-    #TODO(ver217): streaming generation does not support repetition_penalty now
+    # TODO(ver217): streaming generation does not support repetition_penalty now
    model_kwargs = {
-        'max_generate_tokens': max_new_tokens,
-        'early_stopping': True,
-        'top_k': top_k,
-        'top_p': top_p,
-        'temperature': temperature,
-        'prepare_inputs_fn': model.prepare_inputs_for_generation,
-        'update_model_kwargs_fn': update_model_kwargs_fn,
+        "max_generate_tokens": max_new_tokens,
+        "early_stopping": True,
+        "top_k": top_k,
+        "top_p": top_p,
+        "temperature": temperature,
+        "prepare_inputs_fn": model.prepare_inputs_for_generation,
+        "update_model_kwargs_fn": update_model_kwargs_fn,
    }
    is_first_word = True
    generator = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)
@@ -81,9 +81,9 @@ def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
            if is_first_word:
                out_string = out_string.lstrip()
                is_first_word = False
-            elif current_sub_tokens[0].startswith('▁'):
+            elif current_sub_tokens[0].startswith("▁"):
                # whitespace will be ignored by the frontend
-                out_string = ' ' + out_string
+                out_string = " " + out_string
            yield out_string


@@ -92,32 +92,33 @@ async def event_generator(request: Request, generator: Generator):
        if await request.is_disconnected():
            break
        try:
-            yield {'event': 'generate', 'data': next(generator)}
+            yield {"event": "generate", "data": next(generator)}
        except StopIteration:
-            yield {'event': 'end', 'data': ''}
+            yield {"event": "end", "data": ""}
            break


-@app.post('/generate/stream')
-@limiter.limit('1/second')
+@app.post("/generate/stream")
+@limiter.limit("1/second")
 def generate(data: GenerationTaskReq, request: Request):
    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
    event_source = event_generator(
-        request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature))
+        request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature)
+    )
    return EventSourceResponse(event_source)


-@app.post('/generate')
-@limiter.limit('1/second')
+@app.post("/generate")
+@limiter.limit("1/second")
 def generate_no_stream(data: GenerationTaskReq, request: Request):
    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
    if prompt_processor.has_censored_words(prompt):
        return prompt_processor.SAFE_RESPONSE
    inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
    with running_lock:
-        output = model.generate(**inputs, **data.dict(exclude={'history'}))
+        output = model.generate(**inputs, **data.dict(exclude={"history"}))
    output = output.cpu()
-    prompt_len = inputs['input_ids'].size(1)
+    prompt_len = inputs["input_ids"].size(1)
    response = output[0, prompt_len:]
    out_string = tokenizer.decode(response, skip_special_tokens=True)
    out_string = prompt_processor.postprocess_output(out_string)
@@ -126,30 +127,40 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
    return out_string


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
-        'pretrained',
-        help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
-    parser.add_argument('--quant',
-                        choices=['8bit', '4bit'],
-                        default=None,
-                        help='Quantization mode. Default: None (no quantization, fp16).')
+        "pretrained",
+        help="Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.",
+    )
    parser.add_argument(
-        '--gptq_checkpoint',
+        "--quant",
+        choices=["8bit", "4bit"],
        default=None,
-        help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
-    parser.add_argument('--gptq_group_size',
-                        type=int,
-                        default=128,
-                        help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
-    parser.add_argument('--http_host', default='0.0.0.0')
-    parser.add_argument('--http_port', type=int, default=7070)
-    parser.add_argument('--profanity_file', default=None, help='Path to profanity words list. It should be a JSON file containing a list of words.')
+        help="Quantization mode. Default: None (no quantization, fp16).",
+    )
+    parser.add_argument(
+        "--gptq_checkpoint",
+        default=None,
+        help="Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.",
+    )
+    parser.add_argument(
+        "--gptq_group_size",
+        type=int,
+        default=128,
+        help="Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.",
+    )
+    parser.add_argument("--http_host", default="0.0.0.0")
+    parser.add_argument("--http_port", type=int, default=7070)
+    parser.add_argument(
+        "--profanity_file",
+        default=None,
+        help="Path to profanity words list. It should be a JSON file containing a list of words.",
+    )
    args = parser.parse_args()

-    if args.quant == '4bit':
-        assert args.gptq_checkpoint is not None, 'Please specify a GPTQ checkpoint.'
+    if args.quant == "4bit":
+        assert args.gptq_checkpoint is not None, "Please specify a GPTQ checkpoint."

    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)

@@ -159,18 +170,21 @@ if __name__ == '__main__':
        censored_words = []
    prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN, censored_words=censored_words)

-    if args.quant == '4bit':
-        model = load_quant(args.pretrained, args.gptq_checkpoint, 4, args.gptq_group_size)
+    if args.quant == "4bit":
+        with low_resource_init():
+            config = LlamaConfig.from_pretrained(args.pretrained)
+            model = LlamaForCausalLM(config)
+        model = llama_load_quant(model, args.gptq_checkpoint, 4, args.gptq_group_size)
        model.cuda()
    else:
        model = LlamaForCausalLM.from_pretrained(
            args.pretrained,
-            load_in_8bit=(args.quant == '8bit'),
+            load_in_8bit=(args.quant == "8bit"),
            torch_dtype=torch.float16,
            device_map="auto",
        )
-        if args.quant != '8bit':
-            model.half()    # seems to fix bugs for some users.
+        if args.quant != "8bit":
+            model.half()  # seems to fix bugs for some users.
        model.eval()

    config = uvicorn.Config(app, host=args.http_host, port=args.http_port)

--- a/applications/Chat/inference/tests/test_chat_prompt.py
+++ b/applications/Chat/inference/tests/test_chat_prompt.py
@@ -3,44 +3,49 @@ import os
 from transformers import AutoTokenizer
 from utils import ChatPromptProcessor, Dialogue

-CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
-tokenizer = AutoTokenizer.from_pretrained(os.environ['PRETRAINED_PATH'])
+CONTEXT = "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions."
+tokenizer = AutoTokenizer.from_pretrained(os.environ["PRETRAINED_PATH"])

 samples = [
-    ([
-        Dialogue(
-            instruction='Who is the best player in the history of NBA?',
-            response=
-            'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
-        ),
-        Dialogue(instruction='continue this talk', response=''),
-    ], 128,
-     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\nThe best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1\n\n### Instruction:\ncontinue this talk\n\n### Response:\n'
+    (
+        [
+            Dialogue(
+                instruction="Who is the best player in the history of NBA?",
+                response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+            ),
+            Dialogue(instruction="continue this talk", response=""),
+        ],
+        128,
+        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\nThe best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1\n\n### Instruction:\ncontinue this talk\n\n### Response:\n",
    ),
-    ([
-        Dialogue(
-            instruction='Who is the best player in the history of NBA?',
-            response=
-            'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
-        ),
-        Dialogue(instruction='continue this talk', response=''),
-    ], 200,
-     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this talk\n\n### Response:\n'
+    (
+        [
+            Dialogue(
+                instruction="Who is the best player in the history of NBA?",
+                response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+            ),
+            Dialogue(instruction="continue this talk", response=""),
+        ],
+        200,
+        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this talk\n\n### Response:\n",
    ),
-    ([
-        Dialogue(
-            instruction='Who is the best player in the history of NBA?',
-            response=
-            'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
-        ),
-        Dialogue(instruction='continue this talk', response=''),
-    ], 211,
-     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this\n\n### Response:\n'
+    (
+        [
+            Dialogue(
+                instruction="Who is the best player in the history of NBA?",
+                response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+            ),
+            Dialogue(instruction="continue this talk", response=""),
+        ],
+        211,
+        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this\n\n### Response:\n",
    ),
-    ([
-        Dialogue(instruction='Who is the best player in the history of NBA?', response=''),
-    ], 128,
-     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\n'
+    (
+        [
+            Dialogue(instruction="Who is the best player in the history of NBA?", response=""),
+        ],
+        128,
+        "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\n",
    ),
 ]

@@ -52,5 +57,5 @@ def test_chat_prompt_processor():
        assert prompt == result


-if __name__ == '__main__':
+if __name__ == "__main__":
    test_chat_prompt_processor()
--- a/applications/Chat/inference/utils.py
+++ b/applications/Chat/inference/utils.py
+import json
 import re
 from threading import Lock
 from typing import Any, Callable, Generator, List, Optional
-import json
-import jieba

+import jieba
 import torch
 import torch.distributed as dist
 import torch.nn as nn
@@ -20,9 +20,9 @@ except ImportError:
    from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper


-def prepare_logits_processor(top_k: Optional[int] = None,
-                             top_p: Optional[float] = None,
-                             temperature: Optional[float] = None) -> LogitsProcessorList:
+def prepare_logits_processor(
+    top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None
+) -> LogitsProcessorList:
    processor_list = LogitsProcessorList()
    if temperature is not None and temperature != 1.0:
        processor_list.append(TemperatureLogitsWarper(temperature))
@@ -41,29 +41,30 @@ def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
    return unfinished_sequences.max() == 0


-def sample_streamingly(model: nn.Module,
-                       input_ids: torch.Tensor,
-                       max_generate_tokens: int,
-                       early_stopping: bool = False,
-                       eos_token_id: Optional[int] = None,
-                       pad_token_id: Optional[int] = None,
-                       top_k: Optional[int] = None,
-                       top_p: Optional[float] = None,
-                       temperature: Optional[float] = None,
-                       prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
-                       update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
-                       **model_kwargs) -> Generator:
-
+def sample_streamingly(
+    model: nn.Module,
+    input_ids: torch.Tensor,
+    max_generate_tokens: int,
+    early_stopping: bool = False,
+    eos_token_id: Optional[int] = None,
+    pad_token_id: Optional[int] = None,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    temperature: Optional[float] = None,
+    prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+    update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+    **model_kwargs,
+) -> Generator:
    logits_processor = prepare_logits_processor(top_k, top_p, temperature)
    unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)

    for _ in range(max_generate_tokens):
-        model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {
-            'input_ids': input_ids
-        }
+        model_inputs = (
+            prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {"input_ids": input_ids}
+        )
        outputs = model(**model_inputs)

-        next_token_logits = outputs['logits'][:, -1, :]
+        next_token_logits = outputs["logits"][:, -1, :]
        # pre-process distribution
        next_token_logits = logits_processor(input_ids, next_token_logits)
        # sample
@@ -107,27 +108,28 @@ def update_model_kwargs_fn(outputs: dict, **model_kwargs) -> dict:
    if "attention_mask" in model_kwargs:
        attention_mask = model_kwargs["attention_mask"]
        model_kwargs["attention_mask"] = torch.cat(
-            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+        )

    return model_kwargs


 class Dialogue(BaseModel):
-    instruction: str = Field(min_length=1, example='Count up from 1 to 500.')
-    response: str = Field(example='')
+    instruction: str = Field(min_length=1, example="Count up from 1 to 500.")
+    response: str = Field(example="")


-def _format_dialogue(instruction: str, response: str = ''):
-    return f'\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}'
+def _format_dialogue(instruction: str, response: str = ""):
+    return f"\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}"


-STOP_PAT = re.compile(r'(###|instruction:).*', flags=(re.I | re.S))
+STOP_PAT = re.compile(r"(###|instruction:).*", flags=(re.I | re.S))


 class ChatPromptProcessor:
-    SAFE_RESPONSE = 'The input/response contains inappropriate content, please rephrase your prompt.'
+    SAFE_RESPONSE = "The input/response contains inappropriate content, please rephrase your prompt."

-    def __init__(self, tokenizer, context: str, max_len: int = 2048, censored_words: List[str]=[]):
+    def __init__(self, tokenizer, context: str, max_len: int = 2048, censored_words: List[str] = []):
        self.tokenizer = tokenizer
        self.context = context
        self.max_len = max_len
@@ -138,42 +140,48 @@ class ChatPromptProcessor:

    def preprocess_prompt(self, history: List[Dialogue], max_new_tokens: int) -> str:
        if self.context_len is None:
-            self.context_len = len(self.tokenizer(self.context)['input_ids'])
+            self.context_len = len(self.tokenizer(self.context)["input_ids"])
        if self.dialogue_placeholder_len is None:
            self.dialogue_placeholder_len = len(
-                self.tokenizer(_format_dialogue(''), add_special_tokens=False)['input_ids'])
+                self.tokenizer(_format_dialogue(""), add_special_tokens=False)["input_ids"]
+            )
        prompt = self.context
        # the last dialogue must be in the prompt
        last_dialogue = history.pop()
        # the response of the last dialogue is empty
-        assert last_dialogue.response == ''
-        if len(self.tokenizer(_format_dialogue(last_dialogue.instruction), add_special_tokens=False)
-               ['input_ids']) + max_new_tokens + self.context_len >= self.max_len:
+        assert last_dialogue.response == ""
+        if (
+            len(self.tokenizer(_format_dialogue(last_dialogue.instruction), add_special_tokens=False)["input_ids"])
+            + max_new_tokens
+            + self.context_len
+            >= self.max_len
+        ):
            # to avoid truncate placeholder, apply truncate to the original instruction
-            instruction_truncated = self.tokenizer(last_dialogue.instruction,
-                                                   add_special_tokens=False,
-                                                   truncation=True,
-                                                   max_length=(self.max_len - max_new_tokens - self.context_len -
-                                                               self.dialogue_placeholder_len))['input_ids']
+            instruction_truncated = self.tokenizer(
+                last_dialogue.instruction,
+                add_special_tokens=False,
+                truncation=True,
+                max_length=(self.max_len - max_new_tokens - self.context_len - self.dialogue_placeholder_len),
+            )["input_ids"]
            instruction_truncated = self.tokenizer.decode(instruction_truncated).lstrip()
            prompt += _format_dialogue(instruction_truncated)
            return prompt

-        res_len = self.max_len - max_new_tokens - len(self.tokenizer(prompt)['input_ids'])
+        res_len = self.max_len - max_new_tokens - len(self.tokenizer(prompt)["input_ids"])

        rows = []
        for dialogue in history[::-1]:
            text = _format_dialogue(dialogue.instruction, dialogue.response)
-            cur_len = len(self.tokenizer(text, add_special_tokens=False)['input_ids'])
+            cur_len = len(self.tokenizer(text, add_special_tokens=False)["input_ids"])
            if res_len - cur_len < 0:
                break
            res_len -= cur_len
            rows.insert(0, text)
-        prompt += ''.join(rows) + _format_dialogue(last_dialogue.instruction)
+        prompt += "".join(rows) + _format_dialogue(last_dialogue.instruction)
        return prompt

    def postprocess_output(self, output: str) -> str:
-        output = STOP_PAT.sub('', output)
+        output = STOP_PAT.sub("", output)
        return output.strip()

    def has_censored_words(self, text: str) -> bool:
@@ -182,8 +190,8 @@ class ChatPromptProcessor:
        intersection = set(jieba.cut(text.lower())) & self.censored_words
        return len(intersection) > 0

-class LockedIterator:

+class LockedIterator:
    def __init__(self, it, lock: Lock) -> None:
        self.lock = lock
        self.it = iter(it)
@@ -195,6 +203,7 @@ class LockedIterator:
        with self.lock:
            return next(self.it)

+
 def load_json(path: str):
    with open(path) as f:
-        return json.load(f)
\ No newline at end of file
+        return json.load(f)
--- a/applications/Chat/requirements-test.txt
+++ b/applications/Chat/requirements-test.txt
 pytest
+colossalai==0.3.3
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai>=0.2.4
+colossalai==0.3.3
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
@@ -11,3 +11,4 @@ sse_starlette
 wandb
 sentencepiece
 gpustat
+tensorboard
--- a/applications/Chat/setup.py
+++ b/applications/Chat/setup.py
@@ -2,40 +2,42 @@ from setuptools import find_packages, setup


 def fetch_requirements(path):
-    with open(path, 'r') as fd:
+    with open(path, "r") as fd:
        return [r.strip() for r in fd.readlines()]


 def fetch_readme():
-    with open('README.md', encoding='utf-8') as f:
+    with open("README.md", encoding="utf-8") as f:
        return f.read()


 def fetch_version():
-    with open('version.txt', 'r') as f:
+    with open("version.txt", "r") as f:
        return f.read().strip()


 setup(
-    name='coati',
+    name="coati",
    version=fetch_version(),
-    packages=find_packages(exclude=(
-        'tests',
-        'benchmarks',
-        '*.egg-info',
-    )),
-    description='Colossal-AI Talking Intelligence',
+    packages=find_packages(
+        exclude=(
+            "tests",
+            "benchmarks",
+            "*.egg-info",
+        )
+    ),
+    description="Colossal-AI Talking Intelligence",
    long_description=fetch_readme(),
-    long_description_content_type='text/markdown',
-    license='Apache Software License 2.0',
-    url='https://github.com/hpcaitech/Coati',
-    install_requires=fetch_requirements('requirements.txt'),
-    python_requires='>=3.6',
+    long_description_content_type="text/markdown",
+    license="Apache Software License 2.0",
+    url="https://github.com/hpcaitech/Coati",
+    install_requires=fetch_requirements("requirements.txt"),
+    python_requires=">=3.6",
    classifiers=[
-        'Programming Language :: Python :: 3',
-        'License :: OSI Approved :: Apache Software License',
-        'Environment :: GPU :: NVIDIA CUDA',
-        'Topic :: Scientific/Engineering :: Artificial Intelligence',
-        'Topic :: System :: Distributed Computing',
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Environment :: GPU :: NVIDIA CUDA",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: System :: Distributed Computing",
    ],
 )
--- a/applications/Chat/tests/test_benchmarks.sh
+++ b/applications/Chat/tests/test_benchmarks.sh
+#!/bin/bash
+
+# Smoke test for the Chat benchmarks: runs the OPT LoRA dummy benchmark once
+# per selected strategy on a single process.
+#
+# Usage: test_benchmarks.sh [verbose]
+#   verbose  run every strategy instead of only colossalai_zero2
+
+# -x: trace commands, -u: unset variables are errors, -e: stop on first failure.
+set -xue
+
+echo "Hint: You can run this script with 'verbose' as the first argument to run all strategies."
+
+# "$#" is checked first so that "$1" is never expanded when unset (set -u).
+if [[ $# -ne 0 && "$1" == "verbose" ]]; then
+    STRATEGIES=(
+        'ddp'
+        'colossalai_gemini'
+        'colossalai_gemini_cpu'
+        'colossalai_zero2'
+        'colossalai_zero2_cpu'
+        'colossalai_zero1'
+        'colossalai_zero1_cpu'
+    )
+else
+    STRATEGIES=(
+        'colossalai_zero2'
+    )
+fi
+
+# Resolve the directory two levels above this script; every expansion is quoted
+# so that a checkout path containing spaces or glob characters stays intact.
+BASE_DIR="$(dirname "$(dirname "$(realpath "${BASH_SOURCE[0]}")")")"
+BENCHMARKS_DIR="${BASE_DIR}/benchmarks"
+
+echo "[Test]: testing benchmarks ..."
+
+for strategy in "${STRATEGIES[@]}"; do
+    torchrun --standalone --nproc_per_node 1 "${BENCHMARKS_DIR}/benchmark_opt_lora_dummy.py" \
+        --model 125m --critic_model 125m --strategy "${strategy}" --lora_rank 4 \
+        --num_episodes 2 --num_collect_steps 4 --num_update_steps 2 \
+        --train_batch_size 2 --experience_batch_size 4
+done
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -6,7 +6,8 @@ import pytest
 import torch
 import torch.distributed as dist
 from coati.models.gpt import GPTActor
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from coati.models.utils import calc_action_log_probs
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config

 from colossalai.nn.optimizer import HybridAdam
@@ -16,39 +17,37 @@ GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)


 def get_data(batch_size: int, seq_len: int = 10) -> dict:
-    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
+    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
    attention_mask = torch.ones_like(input_ids)
    return dict(input_ids=input_ids, attention_mask=attention_mask)


-def run_test_checkpoint(strategy):
-    BATCH_SIZE = 2
+def train_step(strategy: Strategy, actor: GPTActor, actor_optim: HybridAdam, batch_size: int = 8):
+    """Run one backward pass and optimizer step on a random batch, using summed action log-probs as the loss."""
+    batch = get_data(batch_size)
+    num_actions = torch.ones_like(batch["attention_mask"], dtype=torch.bool).size(1)  # only the length is used
+    logits = actor(batch["input_ids"], batch["attention_mask"])["logits"]
+    loss = calc_action_log_probs(logits, batch["input_ids"], num_actions).sum()
+    strategy.backward(loss, actor, actor_optim)
+    strategy.optimizer_step(actor_optim)

-    if strategy == 'ddp':
+
+def run_test_checkpoint(strategy_name: str, shard: bool):
+    if strategy_name == "ddp":
        strategy = DDPStrategy()
-    elif strategy == 'colossalai_gemini':
-        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
-    elif strategy == 'colossalai_zero2':
-        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    elif strategy_name == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif strategy_name == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
    else:
-        raise ValueError(f'Unsupported strategy "{strategy}"')
+        raise ValueError(f"Unsupported strategy '{strategy_name}'")

    with strategy.model_init_context():
        actor = GPTActor(config=GPT_CONFIG).cuda()
-
    actor_optim = HybridAdam(actor.parameters())
-
    actor, actor_optim = strategy.prepare((actor, actor_optim))

-    def run_step():
-        data = get_data(BATCH_SIZE)
-        action_mask = torch.ones_like(data['attention_mask'], dtype=torch.bool)
-        action_log_probs = actor(data['input_ids'], action_mask.size(1), data['attention_mask'])
-        loss = action_log_probs.sum()
-        strategy.backward(loss, actor, actor_optim)
-        strategy.optimizer_step(actor_optim)
-
-    run_step()
+    train_step(strategy, actor, actor_optim)

    ctx = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()

@@ -57,38 +56,36 @@ def run_test_checkpoint(strategy):
        dist.broadcast_object_list(rank0_dirname)
        rank0_dirname = rank0_dirname[0]

-        model_path = os.path.join(rank0_dirname, 'model.pt')
-        optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')
-
-        strategy.save_model(actor, model_path, only_rank0=True)
-        strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)
-
+        model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
+        strategy.save_model(actor, model_path)
+        optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
+        strategy.save_optimizer(actor_optim, optim_path)
        dist.barrier()

        strategy.load_model(actor, model_path, strict=False)
        strategy.load_optimizer(actor_optim, optim_path)
-
        dist.barrier()

-    run_step()
+    train_step(strategy, actor, actor_optim)


-def run_dist(rank, world_size, port, strategy):
-    os.environ['RANK'] = str(rank)
-    os.environ['LOCAL_RANK'] = str(rank)
-    os.environ['WORLD_SIZE'] = str(world_size)
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = str(port)
-    run_test_checkpoint(strategy)
+def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool):
+    """Export the rank/world-size/master settings as environment variables, then run the checkpoint test."""
+    os.environ["RANK"] = str(rank)
+    # LOCAL_RANK is set to the same value as RANK; MASTER_ADDR is fixed to localhost.
+    os.environ["LOCAL_RANK"] = str(rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(port)
+    run_test_checkpoint(strategy_name, shard)


 @pytest.mark.dist
-@pytest.mark.parametrize('world_size', [2])
-@pytest.mark.parametrize('strategy', ['ddp', 'colossalai_zero2', 'colossalai_gemini'])
+@pytest.mark.parametrize("world_size", [4])
+@pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"])
+@pytest.mark.parametrize("shard", [False, True])
 @rerun_if_address_is_in_use()
-def test_checkpoint(world_size, strategy):
-    spawn(run_dist, world_size, strategy=strategy)
+def test_checkpoint(world_size: int, strategy_name: str, shard: bool):
+    """Spawn ``world_size`` processes running ``run_dist`` for the given strategy and shard flag."""
+    spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard)


-if __name__ == '__main__':
-    test_checkpoint(2, 'colossalai_zero2')
+if __name__ == "__main__":
+    # Manual entry point: runs one configuration with 2 processes (the parametrized test uses 4).
+    test_checkpoint(2, "colossalai_gemini", shard=False)
--- a/applications/Chat/tests/test_dataset.py
+++ b/applications/Chat/tests/test_dataset.py
+import json
+import os
+import tempfile
+from typing import Optional
+
+import pytest
+import torch
+from coati.dataset.prompt_dataset import PromptDataset
+from coati.dataset.reward_dataset import HhRlhfDataset, RmStaticDataset
+from coati.dataset.sft_dataset import IGNORE_INDEX, SFTDataset, SupervisedDataset
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from datasets import load_dataset
+from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, PreTrainedTokenizer
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+# Inline SFT samples (instruction/input/output/id); test_sft_dataset dumps them to a temporary
+# JSON file when no hub dataset path is given.
+SFT_DATASET = [
+    {
+        "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
+        "input": "",
+        "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
+        "id": 0,
+    },
+    {
+        "instruction": "Please provide an action plan for reducing carbon footprint on a corporate level",
+        "input": "",
+        "output": "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
+        "id": 1,
+    },
+    {
+        "instruction": "Write a persuasive email to your boss explaining why you should have a pay raise",
+        "input": "",
+        "output": "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
+        "id": 2,
+    },
+]
+
+# Inline prompt samples (instruction/id); test_prompt_dataset dumps them to a temporary JSON file.
+PROMPT_DATASET = [
+    {
+        "instruction": 'Edit this paragraph to make it more concise: "Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends."',
+        "id": 0,
+    },
+    {"instruction": "Write a descriptive paragraph about a memorable vacation you went on", "id": 1},
+    {"instruction": "Write a persuasive essay arguing why homework should be banned in schools", "id": 2},
+    {"instruction": "Create a chart comparing the statistics on student debt in the United States.", "id": 3},
+]
+
+
+def make_tokenizer(model: str):
+    """Return the tokenizer used for ``model`` in these tests.
+
+    gpt2/bloom/opt reuse the EOS token as padding, llama uses the UNK token,
+    and chatglm is returned as loaded. Raises ``ValueError`` for any other name.
+    """
+    if model == "chatglm":
+        return ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    if model == "llama":
+        llama_tok = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+        llama_tok.pad_token = llama_tok.unk_token
+        return llama_tok
+
+    # The remaining supported models all pad with their EOS token.
+    if model == "gpt2":
+        tok = GPT2Tokenizer.from_pretrained("gpt2")
+    elif model == "bloom":
+        tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
+    elif model == "opt":
+        tok = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    else:
+        raise ValueError(f"Unsupported model '{model}'")
+    tok.pad_token = tok.eos_token
+    return tok
+
+
+def check_content(input_ids_stripped: torch.Tensor, tokenizer: PreTrainedTokenizer, model: str):
+    """Assert that ``input_ids_stripped`` holds no special tokens beyond the model's expected framing.
+
+    Args:
+        input_ids_stripped: 1-D tensor of token ids with padding already removed by the caller.
+        tokenizer: tokenizer providing the special token ids.
+        model: model name; selects which leading/trailing special tokens are expected and stripped.
+    """
+    if model == "opt":
+        # NOTE:  Contrary to GPT2, OPT adds the EOS token </s> to the beginning of every prompt.
+        assert input_ids_stripped[0] == tokenizer.eos_token_id
+        input_ids_stripped = input_ids_stripped[1:]
+    elif model == "llama":
+        assert input_ids_stripped[0] == tokenizer.bos_token_id
+        input_ids_stripped = input_ids_stripped[1:]
+    elif model == "chatglm":
+        assert input_ids_stripped[0] == tokenizer.bos_token_id
+        assert input_ids_stripped[-1] == tokenizer.eos_token_id
+        input_ids_stripped = input_ids_stripped[1:-1]
+    assert torch.all(input_ids_stripped != tokenizer.pad_token_id)
+    assert torch.all(input_ids_stripped != tokenizer.bos_token_id)
+    assert torch.all(input_ids_stripped != tokenizer.eos_token_id)
+    # sep/cls/mask may be undefined (None) for a tokenizer. Skip those, and reduce the
+    # element-wise comparison with torch.all: asserting on an unreduced multi-element
+    # tensor raises instead of checking every position.
+    for optional_id in (tokenizer.sep_token_id, tokenizer.cls_token_id, tokenizer.mask_token_id):
+        if optional_id is not None:
+            assert torch.all(input_ids_stripped != optional_id)
+
+
+@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
+@pytest.mark.parametrize("max_length", [32, 1024])
+@pytest.mark.parametrize("max_datasets_size", [2])
+def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
+    """Build a PromptDataset from PROMPT_DATASET and check size, keys, shapes, padding and content."""
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        data_path = os.path.join(tmp_dir, "prompt_dataset.json")
+        with open(data_path, "w") as f:
+            json.dump(PROMPT_DATASET, f)
+        tokenizer = make_tokenizer(model)
+        assert tokenizer.padding_side in ("left", "right")
+        prompt_dataset = PromptDataset(
+            data_path=data_path,
+            tokenizer=tokenizer,
+            max_datasets_size=max_datasets_size,
+            max_length=max_length,
+        )
+        expected_size = min(max_datasets_size, len(PROMPT_DATASET))
+        assert len(prompt_dataset) == expected_size
+        for i in range(len(prompt_dataset)):
+            sample = prompt_dataset[i]
+            assert isinstance(sample, dict)
+            assert list(sample.keys()) == ["input_ids", "attention_mask"]
+            input_ids = sample["input_ids"]
+            attention_mask = sample["attention_mask"].bool()
+            assert input_ids.shape == attention_mask.shape == torch.Size([max_length])
+            # Every position outside the attention mask must hold the pad token.
+            padding_ids = input_ids[torch.logical_not(attention_mask)]
+            assert torch.all(padding_ids == tokenizer.pad_token_id)
+            check_content(input_ids.masked_select(attention_mask), tokenizer, model)
+
+
+@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
+@pytest.mark.parametrize(
+    ["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"), ("Dahoas/rm-static", None)]
+)
+@pytest.mark.parametrize("max_datasets_size", [32])
+@pytest.mark.parametrize("max_length", [32, 1024])
+def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], max_datasets_size: int, max_length: int):
+    """Check shapes, padding and content of chosen/rejected samples in the train and test reward datasets."""
+    data = load_dataset(dataset_path, data_dir=subset)
+    assert max_datasets_size <= len(data["train"]) and max_datasets_size <= len(data["test"])
+    train_data = data["train"].select(range(max_datasets_size))
+    test_data = data["test"].select(range(max_datasets_size))
+    tokenizer = make_tokenizer(model)
+    assert tokenizer.padding_side in ("left", "right")
+
+    if dataset_path == "Anthropic/hh-rlhf":
+        dataset_cls = HhRlhfDataset
+    elif dataset_path == "Dahoas/rm-static":
+        dataset_cls = RmStaticDataset
+    else:
+        raise ValueError(f'Unsupported dataset "{dataset_path}"')
+    train_dataset = dataset_cls(train_data, tokenizer, max_length)
+    test_dataset = dataset_cls(test_data, tokenizer, max_length)
+
+    assert len(train_dataset) == len(test_dataset) == max_datasets_size
+    expected_shape = torch.Size([max_length])
+    for i in range(max_datasets_size):
+        # For each index, the train sample is checked first, then the test sample.
+        for dataset in (train_dataset, test_dataset):
+            chosen_ids, c_mask, reject_ids, r_mask = dataset[i]
+            assert chosen_ids.shape == c_mask.shape == reject_ids.shape == r_mask.shape == expected_shape
+            c_mask = c_mask.to(torch.bool)
+            r_mask = r_mask.to(torch.bool)
+            for ids, mask in ((chosen_ids, c_mask), (reject_ids, r_mask)):
+                kept_ids = ids.masked_select(mask)
+                if kept_ids[-1] == tokenizer.eos_token_id:
+                    # Sequence ends with EOS: drop it for the content check; the rest must be padding.
+                    check_content(kept_ids[:-1], tokenizer, model)
+                    assert torch.all(ids.masked_select(torch.logical_not(mask)) == tokenizer.pad_token_id)
+                else:
+                    # No trailing EOS: the mask is expected to cover every position.
+                    check_content(kept_ids, tokenizer, model)
+                    assert torch.all(mask)
+
+
+@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
+@pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
+@pytest.mark.parametrize("max_dataset_size", [2])
+@pytest.mark.parametrize("max_length", [32, 1024])
+def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size: int, max_length: int):
+    """Check keys, shapes, padding and content of SFT samples.
+
+    With ``dataset_path`` set, samples come from the hub dataset through ``SFTDataset``;
+    otherwise ``SFT_DATASET`` is dumped to a temporary JSON file and loaded through
+    ``SupervisedDataset``. ChatGLM samples have no ``attention_mask`` key and are checked separately.
+    """
+    tokenizer = make_tokenizer(model)
+    if dataset_path == "yizhongw/self_instruct":
+        data = load_dataset(dataset_path, "super_natural_instructions")
+        train_data = data["train"].select(range(max_dataset_size))
+        sft_dataset = SFTDataset(train_data, tokenizer, max_length)
+    else:
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dataset_name = "sft_dataset.json"
+            with open(os.path.join(tmp_dir, dataset_name), "w") as f:
+                json.dump(SFT_DATASET, f)
+            sft_dataset = SupervisedDataset(
+                tokenizer=tokenizer,
+                data_path=os.path.join(tmp_dir, dataset_name),
+                max_datasets_size=max_dataset_size,
+                max_length=max_length,
+            )
+        assert len(sft_dataset) == min(max_dataset_size, len(SFT_DATASET))
+
+    if isinstance(tokenizer, ChatGLMTokenizer):
+        for i in range(max_dataset_size):
+            sample = sft_dataset[i]
+            assert isinstance(sample, dict)
+            assert list(sample.keys()) == ["input_ids", "labels"]
+            input_ids = sample["input_ids"]
+            labels = sample["labels"]
+            assert input_ids.shape == labels.shape == torch.Size([max_length])
+
+            ignore_mask = labels == IGNORE_INDEX
+            assert input_ids.masked_select(torch.logical_not(ignore_mask))[0] == tokenizer.bos_token_id
+            check_content(input_ids.masked_select(torch.logical_not(ignore_mask)), tokenizer, model)
+        # Return only after every sample has been checked; returning inside the loop
+        # would validate the first sample alone.
+        return
+
+    for i in range(max_dataset_size):
+        sample = sft_dataset[i]
+        assert isinstance(sample, dict)
+        assert list(sample.keys()) == ["input_ids", "labels", "attention_mask"]
+        input_ids = sample["input_ids"]
+        labels = sample["labels"]
+        attention_mask = sample["attention_mask"].to(torch.bool)
+        assert input_ids.shape == labels.shape == attention_mask.shape == torch.Size([max_length])
+        if input_ids.masked_select(attention_mask)[-1] == tokenizer.eos_token_id:
+            check_content(input_ids.masked_select(attention_mask)[:-1], tokenizer, model)
+            assert torch.all(input_ids.masked_select(torch.logical_not(attention_mask)) == tokenizer.pad_token_id)
+        else:
+            check_content(input_ids.masked_select(attention_mask), tokenizer, model)
+            assert torch.all(attention_mask)
+        ignore_mask = labels == IGNORE_INDEX
+        # Ignored-but-attended positions are checked as content; ignored positions outside
+        # the attention mask must hold the pad token.
+        prompt_mask = torch.logical_and(ignore_mask, attention_mask)
+        check_content(input_ids.masked_select(prompt_mask), tokenizer, model)
+        assert torch.all(input_ids.masked_select(ignore_mask ^ prompt_mask) == tokenizer.pad_token_id)
+
+
+if __name__ == "__main__":
+    # Manual entry point: run one hand-picked configuration of each dataset test without pytest.
+    test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)
+
+    test_reward_dataset(
+        model="gpt2", dataset_path="Anthropic/hh-rlhf", subset="harmless-base", max_datasets_size=8, max_length=256
+    )
+
+    test_prompt_dataset(model="opt", max_datasets_size=2, max_length=128)
--- a/applications/Chat/tests/test_data.py
+++ b/applications/Chat/tests/test_data.py
+import copy
 import os
-from copy import deepcopy

 import pytest
 import torch
 import torch.distributed as dist
+from coati.experience_buffer import NaiveExperienceBuffer
 from coati.experience_maker import NaiveExperienceMaker
 from coati.models.base import RewardModel
 from coati.models.gpt import GPTActor, GPTCritic
-from coati.replay_buffer import NaiveReplayBuffer
-from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from coati.trainer.ppo import _set_default_generate_kwargs
+from coati.trainer.strategies import DDPStrategy, GeminiStrategy
+from coati.trainer.strategies.colossalai import LowLevelZeroStrategy
 from transformers.models.gpt2.configuration_gpt2 import GPT2Config

 from colossalai.testing import rerun_if_address_is_in_use, spawn
@@ -17,7 +19,7 @@ GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)


 def get_data(batch_size: int, seq_len: int = 10) -> dict:
+    # Random token ids in [0, 50257) created on the CUDA device, paired with an all-ones attention mask.
-    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
+    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
     attention_mask = torch.ones_like(input_ids)
     return dict(input_ids=input_ids, attention_mask=attention_mask)

@@ -32,36 +34,47 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:
    return True


-def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+def make_and_consume_experience(strategy):
+    EXPERIENCE_BATCH_SIZE = 4
    SAMPLE_BATCH_SIZE = 2

-    if strategy == 'ddp':
+    if strategy == "ddp":
        strategy = DDPStrategy()
-    elif strategy == 'colossalai':
-        strategy = ColossalAIStrategy(placement_policy='cuda')
+    elif strategy == "colossalai-zero2":
+        strategy = LowLevelZeroStrategy()
+    elif strategy == "colossalai-gemini":
+        strategy = GeminiStrategy(placement_policy="static")
    else:
        raise ValueError(f'Unsupported strategy "{strategy}"')

-    actor = GPTActor(config=GPT_CONFIG).cuda()
-    critic = GPTCritic(config=GPT_CONFIG).cuda()
+    with strategy.model_init_context():
+        actor = GPTActor(config=GPT_CONFIG).cuda()
+        critic = GPTCritic(config=GPT_CONFIG).cuda()

-    initial_model = deepcopy(actor)
-    reward_model = RewardModel(deepcopy(critic.model)).cuda()
+        initial_model = GPTActor(config=GPT_CONFIG).cuda()
+        reward_model = RewardModel(model=copy.deepcopy(critic.model)).cuda()

-    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model)
-    replay_buffer = NaiveReplayBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
+    actor, critic, initial_model, reward_model = strategy.prepare(actor, critic, initial_model, reward_model)
+
+    class MockTokenizer:
+        def __init__(self):
+            self.padding_side = "left"
+            self.eos_token_id = 0
+            self.pad_token_id = 0
+
+    tokenizer = MockTokenizer()
+    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer)
+    data_buffer = NaiveExperienceBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
+
+    generate_kwargs = dict(do_sample=True, max_length=16)
+    generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)

    # experience of all ranks should be the same
    for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
-        assert gather_and_equal(data['input_ids'])
-        assert gather_and_equal(data['attention_mask'])
-        experience = experience_maker.make_experience(**data,
-                                                      do_sample=True,
-                                                      max_length=16,
-                                                      eos_token_id=50256,
-                                                      pad_token_id=50256)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
+        assert gather_and_equal(data["input_ids"])
+        assert gather_and_equal(data["attention_mask"])
+        experience = experience_maker.make_experience(**data, do_sample=True, max_length=16)
        assert gather_and_equal(experience.sequences)
        assert gather_and_equal(experience.action_log_probs)
        assert gather_and_equal(experience.values)
@@ -69,12 +82,12 @@ def run_test_data(strategy):
        assert gather_and_equal(experience.advantages)
        assert gather_and_equal(experience.action_mask)
        assert gather_and_equal(experience.attention_mask)
-        replay_buffer.append(experience)
+        data_buffer.append(experience)

-    # replay buffer's data should be the same
-    buffer_size = torch.tensor([len(replay_buffer)], device='cuda')
+    # data buffer's data should be the same
+    buffer_size = torch.tensor([len(data_buffer)], device="cuda")
    assert gather_and_equal(buffer_size)
-    for item in replay_buffer.items:
+    for item in data_buffer.items:
        assert gather_and_equal(item.sequences)
        assert gather_and_equal(item.action_log_probs)
        assert gather_and_equal(item.values)
@@ -84,8 +97,8 @@ def run_test_data(strategy):
        assert gather_and_equal(item.attention_mask)

    # dataloader of each rank should have the same size and different batch
-    dataloader = strategy.setup_dataloader(replay_buffer)
-    dataloader_size = torch.tensor([len(dataloader)], device='cuda')
+    dataloader = strategy.setup_dataloader(data_buffer)
+    dataloader_size = torch.tensor([len(dataloader)], device="cuda")
    assert gather_and_equal(dataloader_size)
    for experience in dataloader:
        assert not gather_and_equal(experience.sequences)
@@ -97,22 +110,21 @@ def run_test_data(strategy):


 def run_dist(rank, world_size, port, strategy):
+    # Export the rank/world-size/master settings as environment variables, then run the experience test body.
-    os.environ['RANK'] = str(rank)
-    os.environ['LOCAL_RANK'] = str(rank)
-    os.environ['WORLD_SIZE'] = str(world_size)
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = str(port)
-    run_test_data(strategy)
+    os.environ["RANK"] = str(rank)
+    os.environ["LOCAL_RANK"] = str(rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(port)
+    make_and_consume_experience(strategy)


-@pytest.mark.skip
 @pytest.mark.dist
-@pytest.mark.parametrize('world_size', [2])
-@pytest.mark.parametrize('strategy', ['ddp', 'colossalai'])
+@pytest.mark.parametrize("world_size", [2])
+@pytest.mark.parametrize("strategy", ["ddp", "colossalai-zero2", "colossalai-gemini"])
 @rerun_if_address_is_in_use()
-def test_data(world_size, strategy):
+def test_experience(world_size, strategy):
+    # Spawn ``world_size`` processes running ``run_dist`` with the given strategy name.
     spawn(run_dist, world_size, strategy=strategy)


-if __name__ == '__main__':
-    test_data(2, 'colossalai')
+if __name__ == "__main__":
+    # Manual entry point: run a single strategy with 2 processes without going through pytest.
+    test_experience(2, "colossalai-zero2")
--- a/applications/Chat/tests/test_inference.sh
+++ b/applications/Chat/tests/test_inference.sh
+#!/usr/bin/env bash
+
+# Smoke-test examples/inference.py once per supported model.
+# -x traces commands, -u rejects unset variables, -e stops at the first failing command.
+set -xue
+
+# Resolve the application root (the parent of this script's directory); quoted so that
+# paths containing whitespace survive word splitting.
+BASE_DIR=$(dirname "$(dirname "$(realpath "$BASH_SOURCE")")")
+EXAMPLES_DIR=$BASE_DIR/examples
+
+echo "[Test]: testing inference ..."
+
+# HACK: skip llama due to oom
+for model in 'gpt2' 'bloom' 'opt'; do
+    python "$EXAMPLES_DIR/inference.py" --model "$model"
+done
--- a/applications/Chat/tests/test_models.py
+++ b/applications/Chat/tests/test_models.py
+import copy
+from typing import Any, Callable, Dict, Tuple
+
+import pytest
+import torch
+import torch.nn as nn
+from coati.models.base import Actor, Critic, RewardModel, get_base_model
+from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
+from coati.models.chatglm import ChatGLMActor
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from coati.models.generation import generate
+from coati.models.gpt import GPTRM, GPTActor, GPTCritic
+from coati.models.llama import LlamaActor
+from coati.models.lora import LoraLinear, convert_to_lora_module
+from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
+from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.models.utils import calc_action_log_probs, masked_mean
+
+
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seq_len", [32])
+@pytest.mark.parametrize(
+    "actor_maker",
+    [
+        lambda: BLOOMActor(),
+        lambda: GPTActor(),
+        # HACK: skip llama due to long execution time
+        # lambda: LlamaActor(),
+        lambda: OPTActor(),
+    ],
+)
+@pytest.mark.parametrize(
+    "generate_kwargs",
+    [
+        {
+            "max_length": 64,
+            "use_cache": True,
+            "do_sample": True,
+            "temperature": 1.0,
+            "top_k": 50,
+        }
+    ],
+)
+def test_generation(actor_maker: Callable[[], Actor], batch_size: int, seq_len: int, generate_kwargs: Dict[str, Any]):
+    """Generate from random prompts and check the output shape is (batch_size, max_length)."""
+
+    class MockTokenizer:
+        # Stand-in exposing only these three attributes.
+        padding_side = "left"
+        eos_token_id = 0
+        pad_token_id = 0
+
+    actor = actor_maker()
+    prompt_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
+    sequences = generate(actor.cuda(), prompt_ids, MockTokenizer(), **generate_kwargs)
+    expected_shape = (batch_size, generate_kwargs["max_length"])
+    assert sequences.shape == expected_shape
+
+
+def test_utils():
+    """Exercise ``masked_mean`` on an all-ones tensor and ``calc_action_log_probs`` on random logits."""
+    ones = torch.ones((10,))
+    random_mask = torch.randint(0, 2, (10,))
+    mean = masked_mean(dim=0, tensor=ones, mask=random_mask)
+    assert mean.dim() == 0
+    assert torch.allclose(mean, torch.tensor(1.0))
+
+    batch_size, seq_len, num_labels, num_actions = 4, 32, 10, 2
+    logits = torch.randn((batch_size, seq_len, num_labels))
+    sequences = torch.randint(0, num_labels, (batch_size, seq_len))
+    log_probs = calc_action_log_probs(logits=logits, sequences=sequences, num_actions=num_actions)
+    assert log_probs.shape == (batch_size, num_actions)
+
+
+@pytest.mark.parametrize("lora_rank", [4])
+@pytest.mark.parametrize("num_dim", [32])
+@pytest.mark.parametrize("num_layers", [4])
+def test_lora(lora_rank: int, num_dim: int, num_layers: int):
+    """Convert linear layers to LoRA and check that one optimizer step changes only the LoRA product."""
+    model = nn.ModuleList([nn.Linear(num_dim, num_dim) for _ in range(num_layers)])
+    lora_model = convert_to_lora_module(model, lora_rank)
+    assert isinstance(lora_model, nn.ModuleList)
+    for i in range(num_layers):
+        layer = lora_model[i]
+        assert isinstance(layer, LoraLinear)
+        assert layer.lora_A.shape == (lora_rank, num_dim)
+        assert layer.lora_B.shape == (num_dim, lora_rank)
+
+    # Snapshot before training; at this point the copy must match the live model exactly.
+    old_model = copy.deepcopy(lora_model)
+    for i in range(num_layers):
+        before, after = old_model[i], lora_model[i]
+        assert isinstance(after, LoraLinear)
+        assert torch.allclose(before.weight, after.weight)
+        assert torch.allclose(before.bias, after.bias)
+        assert torch.allclose(before.lora_B @ before.lora_A, after.lora_B @ after.lora_A)
+
+    optimizer = torch.optim.Adam(lora_model.parameters())
+    hidden = torch.randn(8, num_dim)
+    for i in range(num_layers):
+        hidden = lora_model[i](hidden)
+    hidden.sum().backward()
+    optimizer.step()
+
+    # After the step, weight and bias are unchanged while the LoRA product differs.
+    for i in range(num_layers):
+        before, after = old_model[i], lora_model[i]
+        assert isinstance(after, LoraLinear)
+        assert torch.allclose(before.weight, after.weight)
+        assert torch.allclose(before.bias, after.bias)
+        assert not torch.allclose(before.lora_B @ before.lora_A, after.lora_B @ after.lora_A)
+
+
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seq_len", [128])
+@pytest.mark.parametrize(
+    "models_maker",
+    [
+        lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
+        lambda: (GPTActor(), GPTCritic(), GPTRM()),
+        # HACK: skip llama due to long execution time
+        # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
+        lambda: (OPTActor(), OPTCritic(), OPTRM()),
+        lambda: (ChatGLMActor(), None, None),
+    ],
+)
+@torch.no_grad()
+def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], batch_size: int, seq_len: int):
+    """Run a forward pass through the actor (and critic/reward model when given) and check output shapes."""
+
+    def random_ids() -> torch.Tensor:
+        return torch.randint(0, 100, (batch_size, seq_len))
+
+    def random_mask() -> torch.Tensor:
+        return torch.randint(0, 2, (batch_size, seq_len))
+
+    actor_input = {"input_ids": random_ids(), "attention_mask": random_mask()}
+    critic_input = {"sequences": random_ids(), "attention_mask": random_mask()}
+    rm_input = {"sequences": random_ids(), "attention_mask": random_mask()}
+
+    actor, critic, rm = models_maker()
+    if isinstance(actor, ChatGLMActor):
+        # ChatGLM gets its own inputs: gmask/bos ids inserted mid-sequence and a 4-D attention mask.
+        actor = actor.float()
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+        special_ids = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
+        half_len = seq_len // 2
+        head_ids = torch.randint(0, 100, (batch_size, half_len))
+        tail_ids = torch.randint(0, 100, (batch_size, half_len - 2))
+        glm_mask = torch.randint(0, 2, (batch_size, 1, seq_len, seq_len))
+        actor_input = {
+            "input_ids": torch.cat((head_ids, special_ids, tail_ids), dim=1),
+            "attention_mask": glm_mask,
+        }
+    assert isinstance(actor, Actor)
+    get_base_model(actor)
+    actor_output = actor(**actor_input)
+    assert actor_output.logits.shape[:2] == (batch_size, seq_len)
+
+    if critic:
+        assert isinstance(critic, Critic)
+        get_base_model(critic)
+        critic_output = critic(**critic_input)
+        assert critic_output.shape == (batch_size,)
+
+    if rm:
+        assert isinstance(rm, RewardModel)
+        get_base_model(rm)
+        rm_output = rm(**rm_input)
+        assert rm_output.shape == (batch_size,)
+
+
+@pytest.mark.parametrize("batch_size", [16])
+@pytest.mark.parametrize("seq_len", [128])
+@pytest.mark.parametrize("num_labels", [100])
+def test_loss(batch_size: int, seq_len: int, num_labels: int):
+    """Call each loss module once on random inputs; no values are asserted."""
+
+    def random_vector() -> torch.Tensor:
+        return torch.randn(batch_size)
+
+    lm_loss = GPTLMLoss()
+    lm_loss(
+        logits=torch.randn(batch_size, seq_len, num_labels),
+        labels=torch.randint(0, num_labels, (batch_size, seq_len)),
+    )
+
+    policy_loss = PolicyLoss()
+    policy_loss(log_probs=random_vector(), old_log_probs=random_vector(), advantages=random_vector())
+
+    value_loss = ValueLoss()
+    value_loss(values=random_vector(), old_values=random_vector(), reward=random_vector())
+
+    log_sig_loss = LogSigLoss()
+    log_sig_loss(chosen_reward=random_vector(), reject_reward=random_vector())
+
+    log_exp_loss = LogExpLoss()
+    log_exp_loss(chosen_reward=random_vector(), reject_reward=random_vector())
+
+
+if __name__ == "__main__":
+    # Manual entry point. Note this generation run uses LlamaActor, which the parametrized
+    # test_generation leaves commented out.
+    generate_kwargs = dict(max_length=40, use_cache=True, do_sample=True, temperature=1.0, top_k=50)
+    test_generation(lambda: LlamaActor(), batch_size=4, seq_len=32, generate_kwargs=generate_kwargs)
+
+    test_utils()
+
+    test_lora(lora_rank=2, num_dim=8, num_layers=2)
+
+    test_models(models_maker=lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()), batch_size=8, seq_len=128)
+
+    test_loss(batch_size=8, seq_len=128, num_labels=100)
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
+#!/usr/bin/env bash
+
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+set -xu
+
+# `set -u` is active from here on, so the required variables are expanded with
+# `${VAR:-}`: an unset variable must reach the friendly message below instead of
+# aborting the script with "unbound variable".
+if [ -z "${SFT_DATASET:-}" ]; then
+    echo "Please set \$SFT_DATASET to the path to sft dataset."
+    exit 1
+fi
+
+if [ -z "${PROMPT_DATASET:-}" ]; then
+    echo "Please set \$PROMPT_DATASET to the path to prompts csv."
+    exit 1
+fi
+
+if [ -z "${PRETRAIN_DATASET:-}" ]; then
+    echo "Please set \$PRETRAIN_DATASET to the path to alpaca data."
+    exit 1
+fi
+
+# Number of attempts allowed for each training command before the test fails.
+NUM_RETRY=3
+# BASE_DIR is the parent of the directory that contains this script.
+BASE_DIR=$(dirname "$(dirname "$(realpath "$BASH_SOURCE")")")
+EXAMPLES_DIR=$BASE_DIR/examples
+MODELS_DIR=$BASE_DIR/examples/models_config
+MODELS=('gpt2' 'bloom' 'opt' 'llama')
+STRATEGIES=('ddp' 'colossalai_gemini' 'colossalai_zero2')
+
+
+export OMP_NUM_THREADS=8
+
+# install requirements
+pip install -r "$EXAMPLES_DIR/requirements.txt"
+
+# Fetch only the model configs into $MODELS_DIR (no weights).
+python "$EXAMPLES_DIR/download_model.py" --model-dir "$MODELS_DIR" --config-only
+
+get_pretrain() {
+    local model=$1
+    if [[ $model == "gpt2" ]]; then
+        echo "gpt2"
+    elif [[ $model == "bloom" ]]; then
+        echo "bigscience/bloom-560m"
+    elif [[ $model == "opt" ]]; then
+        echo "facebook/opt-350m"
+    else
+        echo "Unknown model $model"
+        exit 1
+    fi
+}
+
+# Print one pseudo-randomly chosen argument on stdout.
+# Arguments: $@ - candidate values (an empty string is a valid candidate).
+# Returns:   1 with a diagnostic on stderr when no candidates are given.
+random_choice() {
+    local choices=("$@")
+    local count=${#choices[@]}
+    # Guard against `RANDOM % 0`, which is an arithmetic error.
+    if ((count == 0)); then
+        echo "random_choice: no candidates given" >&2
+        return 1
+    fi
+    # Quoted so the chosen element is printed verbatim (no word splitting or
+    # globbing); printf avoids echo treating an element such as "-n" as an option.
+    printf '%s\n' "${choices[RANDOM % count]}"
+}
+
+echo "[Test]: testing sft ..."
+
+# FIXME: This is a hack to skip tests that are not working
+#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
+#  - llama-*: These tests can be passed locally, skipped for long execution time
+#  - *-gemini: Gemini plugin does not support `from_pretrained` yet
+SKIPPED_TESTS=(
+    "gpt2-ddp"
+    "llama-ddp"
+    "llama-colossalai_gemini"
+    "llama-colossalai_zero2"
+)
+# Space-padded flat copy of the skip list so a whole entry can be found by substring search.
+sft_skipped=" ${SKIPPED_TESTS[*]} "
+
+GRAD_CKPTS=('' '--grad_checkpoint')
+for lora_rank in '0'; do
+    for model in ${MODELS[@]}; do
+        # Visit the strategies in a fresh random order for every model.
+        strategies=($(shuf -e "${STRATEGIES[@]}"))
+        for strategy in ${strategies[@]}; do
+            if [[ $sft_skipped == *" $model-$strategy-$lora_rank "* ]]; then
+                echo "[Test]: Skipped $model-$strategy-$lora_rank"
+                continue
+            fi
+            if [[ $sft_skipped == *" $model-$strategy "* ]]; then
+                echo "[Test]: Skipped $model-$strategy"
+                continue
+            fi
+            pretrain=$(get_pretrain $model)
+            # --pretrain is only passed when LoRA is enabled (rank > 0).
+            pretrain_model=""
+            [[ $lora_rank -gt 0 ]] && pretrain_model="--pretrain $pretrain"
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            # Allow up to $NUM_RETRY attempts and stop at the first success.
+            # $pretrain_model and $grad_ckpt stay unquoted so they vanish when empty.
+            for ((i = 1; i <= NUM_RETRY; i++)); do
+                echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
+                torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_sft.py \
+                    $pretrain_model --tokenizer $MODELS_DIR/$model \
+                    --model $model --strategy $strategy --lora_rank $lora_rank $grad_ckpt \
+                    --dataset $SFT_DATASET --max_datasets_size 8 \
+                    --max_epochs 1 --batch_size 1 --accumulation_steps 1 --lr 1e-8 \
+                    --save_path $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank}
+                passed=$?
+                [ $passed -eq 0 ] && break
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed $model-$strategy-$lora_rank"
+                exit 1
+            fi
+        done
+    done
+done
+
+echo "[Test]: testing reward model ..."
+
+# FIXME: This is a hack to skip tests that are not working
+#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
+#  - llama-*: These tests can be passed locally, skipped for long execution time
+#  - *-gemini: Gemini plugin does not support `from_pretrained` yet
+SKIPPED_TESTS=(
+    "gpt2-ddp"
+    "llama-ddp"
+    "llama-colossalai_gemini"
+    "llama-colossalai_zero2"
+)
+# Space-padded flat copy of the skip list so a whole entry can be found by substring search.
+rm_skipped=" ${SKIPPED_TESTS[*]} "
+
+LOSS_FNS=('log_sig' 'log_exp')
+DATASETS=('Anthropic/hh-rlhf' 'Dahoas/rm-static')
+for lora_rank in '0'; do
+    for model in ${MODELS[@]}; do
+        # Visit the strategies in a fresh random order for every model.
+        strategies=($(shuf -e "${STRATEGIES[@]}"))
+        for strategy in ${strategies[@]}; do
+            if [[ $rm_skipped == *" $model-$strategy-$lora_rank "* ]]; then
+                echo "[Test]: Skipped $model-$strategy-$lora_rank"
+                continue
+            fi
+            if [[ $rm_skipped == *" $model-$strategy "* ]]; then
+                echo "[Test]: Skipped $model-$strategy"
+                continue
+            fi
+            pretrain=$(get_pretrain $model)
+            # --pretrain is only passed when LoRA is enabled (rank > 0).
+            pretrain_model=""
+            [[ $lora_rank -gt 0 ]] && pretrain_model="--pretrain $pretrain"
+            loss_fn=$(random_choice "${LOSS_FNS[@]}")
+            dataset=$(random_choice "${DATASETS[@]}")
+            # Dahoas/rm-static is given the literal subset "None"; the other dataset uses "harmless-base".
+            if [[ $dataset == "Dahoas/rm-static" ]]; then
+                subset="None"
+            else
+                subset="harmless-base"
+            fi
+            # Allow up to $NUM_RETRY attempts and stop at the first success.
+            # $pretrain_model stays unquoted so it vanishes when empty.
+            for ((i = 1; i <= NUM_RETRY; i++)); do
+                echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
+                torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_reward_model.py \
+                    $pretrain_model --tokenizer $MODELS_DIR/$model \
+                    --dataset $dataset --subset $subset --max_datasets_size 8 \
+                    --model $model --strategy $strategy --lora_rank $lora_rank \
+                    --loss_fn $loss_fn --batch_size 1 --lr 1e-8 \
+                    --save_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
+                passed=$?
+                [ $passed -eq 0 ] && break
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed to train reward model $model-$strategy-$lora_rank"
+                exit 1
+            fi
+        done
+    done
+done
+
+echo "[Test]: testing RLHF ..."
+
+# FIXME: This is a hack to skip tests that are not working
+#  - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
+#  - llama-*: These tests can be passed locally, skipped for long execution time
+#  - *-gemini: Gemini plugin does not support `from_pretrained` yet
+SKIPPED_TESTS=(
+    "gpt2-ddp"
+    "llama-ddp"
+    "llama-colossalai_gemini"
+    "llama-colossalai_zero2"
+)
+
+for model in ${MODELS[@]}; do
+    for lora_rank in '0'; do
+        # Visit the strategies in a fresh random order for every model.
+        strategies=($(shuf -e "${STRATEGIES[@]}"))
+        for strategy in ${strategies[@]}; do
+            # The skip list is flattened and space-padded so only whole entries match.
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$strategy-$lora_rank"
+                continue
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy " ]]; then
+                echo "[Test]: Skipped $model-$strategy"
+                continue
+            fi
+            rm_pretrain=$(get_pretrain $model)
+            # --rm_pretrain is only passed when LoRA is enabled (rank > 0).
+            rm_pretrain_model=""
+            if [[ $lora_rank -gt 0 ]]; then
+                rm_pretrain_model="--rm_pretrain $rm_pretrain"
+            fi
+            # Allow up to $NUM_RETRY attempts and stop at the first success.
+            # $rm_pretrain_model stays unquoted so it vanishes when empty.
+            for i in $(seq $NUM_RETRY); do
+                echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
+                torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_prompts.py \
+                    --prompt_dataset $PROMPT_DATASET --pretrain_dataset $PRETRAIN_DATASET --max_datasets_size 32 \
+                    --strategy $strategy --model $model --tokenizer $MODELS_DIR/$model \
+                    --num_episodes 1 --num_collect_steps 1 --num_update_steps 1 --lr 1e-8 \
+                    --experience_batch_size 2 --train_batch_size 1 --lora_rank $lora_rank \
+                    --pretrain $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank} \
+                    $rm_pretrain_model --rm_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt \
+                    --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
+                passed=$?
+                if [ $passed -eq 0 ]; then
+                    break
+                fi
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed to train RLHF $model-$strategy-$lora_rank"
+                exit 1
+            fi
+        done
+        # Remove this model's intermediate checkpoints. `-f` keeps the cleanup quiet
+        # for models whose every strategy is skipped and which therefore never
+        # produced a reward-model checkpoint.
+        rm -rf $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank}
+        rm -f $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
+    done
+done
+rm -rf $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
--- a/applications/Colossal-LLaMA-2/README.md
+++ b/applications/Colossal-LLaMA-2/README.md
+<div align="center">
+<h1>
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/colossalllam2.jpg?raw=true" width=800/>
+</h1>
+</div>
+
+## Table of Contents
+- [News](#news)
+- [Colossal-LLaMA-2-7B](#colossal-llama-2-7b)
+    - [Performance Evaluation](#performance-evaluation)
+    - [Examples](#examples)
+    - [Training Logs](#training-logs)
+    - [Import from Transformers (Inference)](#import-from-transformers-inference)
+- [Usage](#usage)
+    - [Install](#install)
+    - [How to run](#how-to-run)
+- [Technical Insights](#technical-insights)
+    - [Data](#data)
+    - [Tokenizer](#tokenizer)
+    - [Training Strategy](#training-strategy)
+    - [Bridging Any Domain-specific Large Models](#bridging-any-domain-specific-large-models)
+- [Citations](#citations)
+
+## News
+* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
+[[blog]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
+[[model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
+
+## Colossal-LLaMA-2-7B
+The [Colossal-AI](https://github.com/hpcaitech/ColossalAI) team has introduced the open-source model **Colossal-LLaMA-2-7B-base**. This model, a derivation of LLaMA-2, has undergone continual pre-training involving approximately 8.5 billion tokens over a duration of 15 hours with 64 A800 GPUs. At a cost of **less than $1,000**, you can achieve results **similar to those that cost millions of dollars to pretrain from scratch**. It is licensed under the LLaMA-2 license and [Apache 2.0 License](https://github.com/hpcaitech/ColossalAI/blob/main/LICENSE) **without any additional commercial use restrictions**. This solution can also be used to build models of specific domain knowledge or tasks.
+
+Colossal-LLaMA-2-7B-base is designed to accommodate both the Chinese and English languages, featuring an expansive context window spanning 4096 tokens. Remarkably, it has exhibited exceptional performance when benchmarked against models of equivalent scale in standard Chinese and English evaluation metrics, including C-Eval and MMLU, among others.
+
+❗️**Important notice**:
+* All training data used for this project is collected from well-known public datasets.
+* We do not use any testing data from the evaluation benchmarks for training.
+
+### Performance Evaluation
+We conducted a comprehensive evaluation on 4 datasets and compared our Colossal-Llama-2-7b-base model with various models.
+
+* We use 5-shot for MMLU and calculate scores based on the logits of first predicted token.
+* We use 5-shot for CMMLU and calculate scores based on the logits of first predicted token.
+* We use 5-shot for AGIEval and only calculate scores for 4-choice questions using a combination metric of exact match and the logits of first predicted token. If any of the exact match or logits of first predicted token is correct, the model will get the score.
+* We use 0-shot for GAOKAO-Bench and only calculate scores for 4-choice questions based on the logits of first predicted token.
+* The generation config for all datasets is greedy search.
+* We also provide CEval scores from its latest leaderboard or the official repository of the model.
+
+|                                |  Backbone  | Tokens Consumed |  |         MMLU         |     CMMLU     | AGIEval | GAOKAO | CEval  |
+| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
+|                                |           |        -        |                |        5-shot        |    5-shot     | 5-shot  | 0-shot | 5-shot |
+|          Baichuan-7B           |     -      |      1.2T       |             |    42.32 (42.30)     | 44.53 (44.02) |  38.72  | 36.74  | 42.80  |
+|       Baichuan-13B-Base        |     -      |      1.4T       |             |    50.51 (51.60)     | 55.73 (55.30) |  47.20  | 51.41  | 53.60  |
+|       Baichuan2-7B-Base        |     -      |      2.6T       |             |    46.97 (54.16)     | 57.67 (57.07) |  45.76  | 52.60  | 54.00  |
+|       Baichuan2-13B-Base       |     -      |      2.6T       |             |    54.84 (59.17)     | 62.62 (61.97) |  52.08  | 58.25  | 58.10  |
+|           ChatGLM-6B           |     -      |      1.0T       |             |    39.67 (40.63)     |   41.17 (-)   |  40.10  | 36.53  | 38.90  |
+|          ChatGLM2-6B           |     -      |      1.4T       |             |    44.74 (45.46)     |   49.40 (-)   |  46.36  | 45.49  | 51.70  |
+|          InternLM-7B           |     -      |      1.6T       |                |    46.70 (51.00)     |   52.00 (-)   |  44.77  | 61.64  | 52.80  |
+|            Qwen-7B (original)             |     -      |      2.2T       |             | 54.29 (56.70) | 56.03 (58.80) |  52.47  | 56.42  | 59.60  |
+|                                |            |                 |                 |                      |               |         |        |        |
+|           Llama-2-7B           |     -      |      2.0T       |             |    44.47 (45.30)     |   32.97 (-)   |  32.60  | 25.46  |   -    |
+| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B |      1.0T       |             |        37.43         |     29.92     |  32.00  | 27.57  |   -    |
+| wenge-research/yayi-7b-llama2  | Llama-2-7B |        -        |                |        38.56         |     31.52     |  30.99  | 25.95  |   -    |
+| ziqingyang/chinese-llama-2-7b  | Llama-2-7B |        -        |                |        33.86         |     34.69     |  34.52  | 25.18  |  34.2  |
+| TigerResearch/tigerbot-7b-base | Llama-2-7B |      0.3T       |             |        43.73         |     42.04     |  37.64  | 30.61  |   -    |
+|  LinkSoul/Chinese-Llama-2-7b   | Llama-2-7B |        -        |                |        48.41         |     38.31     |  38.45  | 27.72  |   -    |
+|       FlagAlpha/Atom-7B        | Llama-2-7B |      0.1T       |             |        49.96         |     41.10     |  39.83  | 33.00  |   -    |
+| IDEA-CCNL/Ziya-LLaMA-13B-v1.1  | Llama-13B  |      0.11T      |            |        50.25         |     40.99     |  40.04  | 30.54  |   -    |
+|  |  |  |  |  |  |  |  |  |
+|    **Colossal-LLaMA-2-7b-base**    | Llama-2-7B |      **0.0085T**      |            |        53.06         |     49.89     |  51.48  | 58.82  |  50.2  |
+
+> The scores in parentheses are the scores reported in the official repository of each model.
+>
+> We use zero-shot for ChatGLM models.
+>
+> Qwen-7B is now inaccessible on Hugging Face; we use the latest version that was available before it was made inaccessible. For the MMLU dataset only, the prompt is "xxx Answer:" (with the space after ":" removed) and we calculate the logits over " A", " B", " C" and " D" for Qwen-7B. Qwen-7B tends to be much more deterministic than other models. For example, the logits over " A" can be `-inf` and the softmax would be exactly `0`.
+>
+> For other models and other datasets, we calculate logits over "A", "B", "C" and "D".
+
+❗️ For more details on the evaluation methods and on reproducing the results, please refer to [ColossalEval](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ColossalEval).
+
+### Examples
+| Question Type | Question | <center>Colossal-LLaMA-2-7b-base</center> |
+| :------: | :-----------------------: | :----------- |
+| Knowledge Quiz<br>(Tourism)  | 请问武汉的最佳的旅行季节是什么时候？ | 3-4月份到武汉赏樱，10月份到武汉看黄陂枫树。武汉的最佳的旅行季节是春秋，3-4月份到武汉赏樱，10月份到武汉看黄陂枫树。武汉，简称“汉”别称江城，是湖北省省会，中部六省唯一的副省级市，特大城市，中国中部地区的中心城市，全国重要的工业基地、科教基地和综合交通枢纽。 |
+| Knowledge Quiz<br>(Computer Science)  | 在计算机中，什么是RAM？与ROM有什么区别？  | RAM是随机存取内存，可以随时读写，但是断电之后会丢失数据。存储数据比较快，但是需要不断的进行通电供电。ROM是只读内存，只能读取数据，不能写入数据。断电之后，存储的数据也不会丢失，但是需要不断进行通电。两者的价格和性能上会有区别，而且在使用的时候，RAM需要不断的供电，而ROM断电数据不丢失。  |
+| Knowledge Quiz<br>(Literature)  | 古代文学与现代文学有哪些区别和共通之处？  | 古代文学与现代文学有三点区别，现代文学更加通俗易懂，具有可读性，而古代文学相对来说更加注重形式，缺乏可读性。现代文学和古代文学，同属于中华文学大花园中的一枝花蕾。但是它们由于历史的原因，有着不同的特点，具体表现在，现代文学有更丰富的描写方法，更强烈的主观色彩和更突出的抒情色彩，而古代文学的主要特征是在创作风格上的多样化。  |
+| Emotional Analysis  | 请判断这个句子的情绪是否正面：我迫不及待想见到她！！！”。  | 这一句是情绪是正向的。是在期待见面的喜悦情绪。  |
+| Translation  | 将以下句子翻译成英语。 我喜欢看电影和读书。  | I like watching movies and reading books.   |
+| Logical Reasoning  | All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books?  | To find the weight of 2 books, we can use the following steps:<br><br>1. Determine the weight of 1 book.<br>2. Multiply the weight of 1 book by the number of books.<br><br>Step 1: Determine the weight of 1 book.<br>Since 10 books weigh 5kg, we can divide 5kg by 10 to find the weight of 1 book.<br>5kg ÷ 10 = 0.5kg<br><br>Step 2: Multiply the weight of 1 book by the number of books.<br>To find the weight of 2 books, we multiply the weight of 1 book by 2.<br>0.5kg × 2 = 1kg<br><br>So, the weight of 2 books is 1kg. |
+| Information Extraction  | The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence.  | The date mentioned in the sentence is October 20th to October 23rd, 2021.  |
+| Error Correction  | Please correct the syntax error and restructure the sentence: "Went to the store me and my friend."  | The given sentence "Went to the store me and my friend." is incorrect due to a syntax error and it lacks a subject. The corrected version with proper grammar and sentence structure would be: "My friend and I went to the store."  |
+
+❗️ For more examples of question answering, please refer to [Colossal-LLaMA-2-7B-base Examples](docs/example.md).
+
+### Training Logs
+We also recorded the training logs for the experiment:
+
+<p id="Colossal-LLaMA-2-Multi-stage-training" align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/trainingLossBySteps.jpeg?raw=true" width=600/>
+</p>
+
+<p id="Colossal-LLaMA-2-Multi-stage-training" align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/trainingLossByTokens.jpeg?raw=true" width=600/>
+</p>
+
+### Import from Transformers (Inference)
+To load Colossal-LLaMA-2-7B-base model using Transformers, use the following code:
+```Python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model = AutoModelForCausalLM.from_pretrained("hpcai-tech/Colossal-LLaMA-2-7b-base", device_map="auto", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("hpcai-tech/Colossal-LLaMA-2-7b-base", trust_remote_code=True)
+input = "离离原上草，"
+inputs = tokenizer(input, return_tensors='pt')
+inputs = inputs.to('cuda:0')
+pred = model.generate(**inputs,
+                        max_new_tokens=256,
+                        do_sample=True,
+                        top_k=50,
+                        top_p=0.95,
+                        num_return_sequences=1)
+print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)[len(input):])
+```
+
+You can also download model weights from [🤗HuggingFace](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base).
+
+## Usage
+### Install
+
+#### 0. Pre-requisite
+1. This experiment was performed on 8 computing nodes with 64 A800 GPUs in total for LLaMA-2-7B (**about 1000 USD cost**). The nodes are connected with RDMA and GPUs within one node are fully connected with NVLink. The script was tested with CUDA 11.7; CUDA 11.7 or higher is required. You can also complete it in about 5 days on an 8*A100/A800 server.
+
+2. PyTorch. The PyTorch version should be less than 2.0.0 and greater than 1.12.1.
+
+
+#### 1. Install required packages
+```
+cd Colossal-LLaMA-2
+pip install -r requirements.txt
+```
+#### 2. Install `xentropy`, `layer_norm` and `rotary`
+```bash
+git clone git@github.com:Dao-AILab/flash-attention.git
+# At the root folder
+cd csrc/xentropy && pip install .
+# At the root folder
+cd csrc/layer_norm && pip install .
+# At the root folder
+cd csrc/rotary && pip install .
+```
+
+### How to run
+
+#### 1. Init Tokenizer Preparation
+Initialize new tokenizer with additional Chinese tokens. Additional Chinese tokens are stored in `jsonl` format as follows:
+```json
+{"piece": "你好"}
+{"piece": "人工智能"}
+```
+Command to initialize new tokenizer:
+```bash
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
+python colossal_llama2/tokenizer/init_tokenizer.py \
+    --source_tokenizer_dir "<SOURCE_TOKENIZER_DIR>" \
+    --target_tokenizer_dir "<TARGET_TOKENIZER_DIR>" \
+    --expand_tokens_file "<NEW_TOKENS_FILE>.jsonl"
+```
+Here are the details about the CLI arguments:
+* Source tokenizer directory: `--source_tokenizer_dir`. Directory to the source tokenizer. It should at least contain three files: `special_tokens_map.json`, `tokenizer.model` and `tokenizer_config.json`.
+* Target tokenizer directory: `--target_tokenizer_dir`. Directory to the target tokenizer.
+* Tokens to be added: `--expand_tokens_file`. Additional tokens to be added to the tokenizer.
+
+#### 2. Init Model Preparation
+Initialize the new model checkpoint by calculating the mean values from the original model checkpoint.
+Command to initialize new model checkpoint:
+```bash
+python colossal_llama2/model/init_model.py \
+    --source_model_and_tokenizer_path "<SOURCE_MODEL_AND_TOKENIZER_DIR>" \
+    --target_tokenizer_path "<TARGET_TOKENIZER_DIR>" \
+    --target_model_path "<TARGET_MODEL_DIR>"
+```
+"<TARGET_MODEL_DIR>" can be the same as "<TARGET_TOKENIZER_DIR>".
+
+Here are the details about the CLI arguments:
+* Source model and tokenizer path: `--source_model_and_tokenizer_path`. Source folder contains both model and tokenizer, for example, LLaMA-2 model in Hugging Face format.
+* Target tokenizer path: `--target_tokenizer_path`. Path to the new tokenizer folder generated from previous step.
+* Target model path: `--target_model_path`. Path to save the new model in Hugging Face format.
+
+❗️**Important**: Once you initialize the new model checkpoint, copy your new tokenizer files (`special_tokens_map.json`, `tokenizer.model` and `tokenizer_config.json`) to your new model folder.
+
+#### 3. Data Preparation
+Raw data should be formatted as `jsonl` format. Each data point should have the following fields:
+* `source` (str, compulsory): This part is ignored when calculating loss. Default can be empty.
+* `target` (str, compulsory): Loss will be calculated.
+* `category` (str, compulsory): Tags for each data point.
+
+Examples:
+```JSON
+{"source": "", "target": "Lionel Andrés Messi(Spanish pronunciation: [ljoˈnel anˈdɾes ˈmesi] (i); born 24 June 1987), also known as Leo Messi, is an Argentine professional footballer who plays as a forward for and captains both Major League Soccer club Inter Miami and the Argentina national team.", "category": "sports"}
+{"source": "猜谜语：一身卷卷细毛，吃的青青野草，过了数九寒冬，无私献出白毛。（打一动物）", "target": "白羊", "category": "riddle"}
+```
+You are allowed to customize the category tags or use `unknown` to define the category.
+
+Command to convert jsonl dataset to arrow format:
+```
+python prepare_pretrain_dataset.py \
+    --data_input_dirs "<JSONL_DIR_1>,<JSONL_DIR_2>,<JSONL_DIR_3>" \
+    --tokenizer_dir "<TOKENIZER_DIR>" \
+    --data_cache_dir "jsonl_to_arrow_cache" \
+    --data_jsonl_output_dir "spliced_tokenized_output_jsonl" \
+    --data_arrow_output_dir "spliced_tokenized_output_arrow" \
+    --max_length 4096 \
+    --num_spliced_dataset_bins 10
+```
+Here are the details about the CLI arguments:
+* Source data directory: `data_input_dirs`. Each `<JSONL_DIR>` can have multiple files in `jsonl` format.
+* Tokenizer directory: `tokenizer_dir`. Path to the tokenizer in Hugging Face format.
+* Data cache directory: `data_cache_dir`. Directory to store Hugging Face data cache. Default case will create `cache` folder locally.
+* Output directory for jsonl format: `data_jsonl_output_dir`. Output directory to store converted dataset in jsonl format.
+* Output directory for arrow format: `data_arrow_output_dir`. Output directory to store converted dataset in arrow format, which can be used for training directly.
+* Max length: `max_length`. Max length of spliced samples. Default value is 4096.
+* Number of bins for each category: `num_spliced_dataset_bins`. Number of bins for each category, used for bucket-based training.
+
+#### 4. Command Line Arguments for Training
+You can use `colossalai run` to launch multi-node training:
+```bash
+colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \
+train.py --OTHER_CONFIGURATIONS
+```
+Here is a sample hostfile:
+```bash
+hostname1
+hostname2
+hostname3
+hostname4
+```
+Make sure the master node can access all nodes (including itself) via SSH without a password.
+
+Here are the details about the CLI arguments:
+* Pre-trained model path: `--pretrained`. Path to the pre-trained model in Hugging Face format.
+* Dataset path: `--dataset`. Path to the pre-tokenized dataset.
+* Booster plugin: `--plugin`. `gemini`, `gemini_auto`, `zero2`, `zero2_cpu` and `3d` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins/).
+* Intermediate checkpoint to load: `--load_checkpoint`. Path to the intermediate checkpoint. Saved checkpoint contains the states for `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`. If `load_checkpoint` points to the `modelling` folder, only the model weights will be loaded without any other states to support multi-stage training.
+* Save interval: `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
+* Checkpoint directory: `--save_dir`. The directory path to save checkpoint and intermediate states. Intermediate states include `lr_scheduler`, `optimizer`, `running_states.json` and `modelling`.
+* Tensorboard directory: `--tensorboard_dir`. The path to save tensorboard logs.
+* Configuration file: `--config_file`. The path to save the configuration file.
+* Number of epochs: `--num_epochs`. Number of training epochs. The default value is 1.
+* Micro batch size: `--micro_batch_size`. Batch size per GPU. The default value is 1.
+* Learning rate: `--lr`. The default value is 3e-4.
+* Max length: `--max_length`. Max context length. The default value is 4096.
+* Mixed precision: `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
+* Gradient clipping: `--gradient_clipping`. The default value is 1.0.
+* Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
+* Warmup steps: `-s`, `--warmup_steps`. The default value is calculated by 0.025 warmup ratio.
+* Gradient checkpointing: `--use_grad_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
+* Flash attention: `--use_flash_attn`. If you want to use flash attention, you must install `flash-attn` and related packages. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
+* Freeze non-embedding parameters: `--freeze_non_embeds_params`. Freeze non-embedding parameters. It can be helpful to align embeddings after extending vocabulary size.
+* Tensor parallelism size: `--tp`. TP size for 3d Parallelism. The default value is 1.
+* Zero stage: `--zero`. Zero stage for 3d Parallelism. The default value is 1.
+
+#### 5. Running Command
+An [example bash](train.example.sh) is also provided for the experiment. Here are the steps to run the experiment:
+* Create your own hostfile: `cp hostfile.example hostfile`.
+* Create your own bash: `cp train.example.sh train.sh`.
+* Add your real host ip or host name into the `hostfile`.
+* Update global variables and parameters in your `train.sh`.
+* Run the experiment by `bash train.sh`
+
+Here are the details about the global variables for each experiment:
+* `PROJECT_NAME`: Project name for each experiment.
+* `PARENT_SAVE_DIR`: Parent folder to save model checkpoint.
+* `PARENT_TENSORBOARD_DIR`: Parent folder to save tensorboard logs.
+* `PARENT_CONFIG_FILE`: Parent folder to save configuration for each experiment.
+* `PRETRAINED_MODEL_PATH`: Path to the local pre-trained model checkpoint.
+* `dataset`: Paths to all prepared data. Typically, it's a list of subfolders within the output path of prepare data, `--data_arrow_output_dir`, and if there are multiple subfolders, please list them all. e.g.,
+```bash
+declare -a dataset=(
+    "<DIR_1>/part-00000"
+    "<DIR_1>/part-00001"
+    "<DIR_2>/part-00000"
+)
+```
+## Technical Insights
+In order to enhance LLaMA-2's capabilities for understanding and generating Chinese content, the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) team proposes the continuation of pre-training the LLaMA-2 model using both Chinese and English corpora. The overall pipeline can be described as follows:
+
+<p id="Colossal-LLaMA-2-pipeline" align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/Colossal-LLaMA-2-pipeline.jpeg?raw=true" width=800/>
+</p>
+
+### Data
+Large language models such as LLaMA-2 have undergone training using a heterogeneous blend of high-quality datasets, yielding promising outcomes. Enhancing LLaMA-2's performance for the Chinese corpus, while preserving its proficiency in English, critically hinges on two pivotal factors: the composition of the dataset, which encompasses both English and Chinese content, and the quality of each constituent dataset.
+
+The following figure shows the data processing pipeline conducted for Colossal-LLaMA-2.
+<p id="Colossal-LLaMA-2-data-processing-pipeline" align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/data_processing_pipeline.jpeg?raw=true" width=800/>
+</p>
+
+❗️**Important**: We will open-source our data-processing toolkit soon, stay tuned!
+
+### Tokenizer
+Firstly, the original LLaMA-2 vocabulary comprises fewer than a thousand Chinese characters and thus proves inadequate for encoding comprehensive Chinese texts effectively. Secondly, the utilization of byte tokens presents a challenge for transformer encoders to capture the semantic nuances of Chinese characters.
+
+To address the above issues, we extend the LLaMA-2 vocabulary from 32,000 to 69,104 tokens. To adapt the LLaMA-2 model for use with the Colossal-LLaMA-2 tokenizer, we initialize the new word embeddings by calculating the mean values from the original LLaMA-2 embeddings and subsequently append these new rows to the end of the original embedding matrices.
+
+Advantages of extending vocabulary size:
+* Improve the compression rate of string sequence encoding.
+* Enhance the integrity of information.
+* Enable encoded sequences to contain more valuable information, thereby theoretically enhancing the ability for chapter-level encoding.
+
+Disadvantages of a large vocabulary size under low-resource settings:
+* The presence of numerous unused tokens can be attributed to the limited training dataset, where an excessive number of tokens might not have been effectively learned.
+* Excessive vocabulary expansion leads to an increase in embedding-related parameters, resulting in higher memory usage, which, in turn, affects the efficiency of the training process.
+
+To balance both sides, we finally construct our vocabulary with a size of 69,104. The following table presents a comparison of various models at the 7B level.
+
+| Model | Vocabulary Size | Compression Rate | Average Length of Samples (token-level) |
+| :-----------: | :---------: | :----: | :----: |
+| Colossal-LLaMA-2 | 69104 | 0.659 | 73.682 |
+| LLaMA-2-7B | 32000 | 1.205 | 134.689 |
+| Atom-7B | 65000 | 0.634 | 70.915 |
+| Baichuan-7B | 64000 | 0.678 | 75.857 |
+| Baichuan2-7B-base | 125696 | 0.570 | 63.761 |
+| Chatglm2-6B | 64789 | 0.645 | 72.178 |
+| InternLM-7B | 103168 | 0.566 | 63.349 |
+| Qwen-7B | 151643 | 0.578 | 64.703 |
+| Tigerbot-7B-base | 60515 | 0.630 | 70.515 |
+| Yayi-7B-llama2 | 32005 | 1.214 | 135.689 |
+| Chinese-llama-2-7b | 55296 | 0.668 | 74.690 |
+| Chinese-Falcon-7B | 90046 | 0.669 | 74.858 |
+| LinkSoul-Chinese-Llama-2-7b | 40076 | 0.958 | 107.089 |
+| Ziya-LLaMA-13B-v1.1 | 39410 | 0.958 | 107.074 |
+
+
+### Training Strategy
+#### Multi-stage Training
+In order to enhance the model's performance and harness the full potential of the original LLaMA-2, we have developed a multi-stage training strategy. This strategy is designed to systematically unlock the model's capabilities over a series of stages.
+
+Therefore, we have divided the training process into three stages:
+* Large-scale pre-training stage (Conducted by LLaMA-2): This initial stage is aimed at establishing the model's foundational capabilities from the ground up. It necessitates the use of a substantial dataset comprising no less than 1 trillion tokens.
+* Chinese knowledge injection stage: In this stage, we introduce Chinese knowledge into the model. It requires access to a high-quality dataset rich in comprehensive knowledge relevant to the Chinese language.
+* Knowledge replay stage: Knowledge is replayed through a question-answering (QA) mechanism, encompassing both the Chinese and English domains.
+
+Following the completion of this multi-stage training process, the model exhibits notable improvements in performance across both English and Chinese benchmarks.
+
+The following figure illustrates the three stages for training Colossal-LLaMA-2.
+
+<p id="Colossal-LLaMA-2-Multi-stage-training" align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/multi-stage-training.png?raw=true" width=600/>
+</p>
+
+#### Bucket-based Training
+Our experiments have revealed that the distributions within the training dataset, as well as the arrangement of various topic-related data points, significantly impact the overall performance of the model, particularly in the context of continual pre-training of LLaMA-2.
+
+In an effort to achieve a more balanced distribution and exert control over the dataset's ordering, we have adopted a method where we divide each sub-dataset into discrete bins. These bins are then combined to construct individual data buckets, with one bin contributed by each sub-dataset.
+
+### Bridging Any Domain-specific Large Models
+Applying the above process to perform knowledge transfer in any field allows for the cost-effective construction of lightweight domain-specific foundational large models.
+
+<p id="domain_specific-llm" align="center">
+<img src="https://github.com/hpcaitech/public_assets/blob/main/applications/colossal-llama-2/domain_specific-llm.jpeg?raw=true" width=800/>
+</p>
+
+## Citations
+```bibtex
+@article{bian2021colossal,
+    title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
+    author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
+    journal={arXiv preprint arXiv:2110.14883},
+    year={2021}
+}
+```
+```bibtex
+@misc{touvron2023llama,
+    title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
+    author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
+    year={2023},
+    eprint={2307.09288},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+```bibtex
+@article{dao2023flashattention2,
+    title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
+    author={Dao, Tri},
+    year={2023}
+}
+```
--- a/applications/Colossal-LLaMA-2/colossal_llama2/__init__.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/__init__.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
--- a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
--- a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import os
+import random
+from dataclasses import dataclass
+from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
+
+import torch
+from datasets import dataset_dict, load_from_disk
+from datasets import Dataset as HFDataset
+from torch.distributed import ProcessGroup
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
+from transformers.tokenization_utils import PreTrainedTokenizer
+import torch.nn.functional as F
+
+DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+PathType = Union[str, os.PathLike]
+
+
+def load_tokenized_dataset(
+    dataset_paths: Union[PathType, List[PathType]], mode: str = "train"
+) -> Optional[DatasetType]:
+    """
+    Load pre-tokenized dataset.
+    Each instance of dataset is a dictionary with
+    `{'input_ids': List[int], 'labels': List[int], sequence: str}` format.
+    """
+    mode_map = {"train": "train", "dev": "validation", "test": "test"}
+    assert mode in tuple(mode_map), f"Unsupported mode {mode}, it must be in {tuple(mode_map)}"
+
+    if isinstance(dataset_paths, (str, os.PathLike)):
+        dataset_paths = [dataset_paths]
+
+    datasets = []  # `List[datasets.dataset_dict.Dataset]`
+    for ds_path in dataset_paths:
+        ds_path = os.path.abspath(ds_path)
+        assert os.path.exists(ds_path), f"Not existed file path {ds_path}"
+        ds_dict = load_from_disk(dataset_path=ds_path, keep_in_memory=False)
+        if isinstance(ds_dict, HFDataset):
+            datasets.append(ds_dict)
+        else:
+            if mode_map[mode] in ds_dict:
+                datasets.append(ds_dict[mode_map[mode]])
+    if len(datasets) == 0:
+        return None
+    if len(datasets) == 1:
+        return datasets.pop()
+    return ConcatDataset(datasets=datasets)
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """
+    Collate a mini-batch of pre-tokenized supervised instances into padded tensors.
+
+    Each instance is a tokenized dictionary with fields
+    `input_ids`(List[int]), `labels`(List[int]) and `sequence`(str);
+    only `input_ids` and `labels` are read by this collator.
+    """
+
+    # Tokenizer providing `pad_token_id` and `padding_side`.
+    tokenizer: PreTrainedTokenizer
+    # Sequences longer than this are truncated; right-padded batches are padded up to this length.
+    max_length: int = 4096
+    # Value used to pad `labels`.
+    ignore_index: int = -100
+
+    def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
+        """
+        Truncate, pad and stack the instances of one mini-batch.
+
+        Args:
+            instances (`Sequence[Dict[str, List[int]]]`):
+                Mini-batch samples, each sample is stored in an individual dictionary.
+
+        Returns:
+            (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
+                `input_ids`: `torch.Tensor` of shape (bsz, max_len), padded with `tokenizer.pad_token_id`;
+                `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len), `False` wherever
+                    `input_ids` equals `tokenizer.pad_token_id`;
+                `labels`: `torch.Tensor` of shape (bsz, max_len), padded with `self.ignore_index`.
+            With right padding `max_len` is `self.max_length`; with left padding it is the length of
+            the longest (truncated) sequence in the batch.
+
+        Raises:
+            AssertionError: If `tokenizer.pad_token_id` is not a non-negative integer.
+            RuntimeError: If `tokenizer.padding_side` is neither `left` nor `right`.
+        """
+        assert isinstance(self.tokenizer.pad_token_id, int) and self.tokenizer.pad_token_id >= 0, (
+            f"`{self.tokenizer.__class__.__name__}.pad_token_id` must be a valid non-negative integer index value, "
+            f"but now `{self.tokenizer.pad_token_id}`"
+        )
+
+        # `List[torch.Tensor]`, each sequence truncated to at most `max_length` tokens.
+        batch_input_ids = [
+            torch.LongTensor(instance["input_ids"][: self.max_length])
+            if len(instance["input_ids"]) > self.max_length
+            else torch.LongTensor(instance["input_ids"])
+            for instance in instances
+        ]
+        batch_labels = [
+            torch.LongTensor(instance["labels"][: self.max_length])
+            if len(instance["labels"]) > self.max_length
+            else torch.LongTensor(instance["labels"])
+            for instance in instances
+        ]
+
+        if self.tokenizer.padding_side == "right":
+            input_ids = torch.nn.utils.rnn.pad_sequence(
+                sequences=batch_input_ids,
+                batch_first=True,
+                padding_value=self.tokenizer.pad_token_id,
+            )  # (bsz, longest sequence in batch)
+            labels = torch.nn.utils.rnn.pad_sequence(
+                sequences=batch_labels,
+                batch_first=True,
+                padding_value=self.ignore_index,
+            )  # (bsz, longest sequence in batch)
+            # Pad further so every batch has exactly `max_length` columns.
+            to_pad = self.max_length - input_ids.size(1)
+            input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+            labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
+        elif self.tokenizer.padding_side == "left":
+            # `pad_sequence` only pads on the right, so flip each sequence, pad, then flip back.
+            # NOTE(review): unlike the right-padding branch, this branch does not pad up to
+            # `max_length` -- confirm whether the difference is intended.
+            reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
+            reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
+                sequences=reversed_input_ids,
+                batch_first=True,
+                padding_value=self.tokenizer.pad_token_id,
+            )  # (bsz, longest sequence in batch)
+            input_ids = torch.flip(reversed_input_ids, dims=(1,))  # (bsz, longest sequence in batch)
+            reversed_labels = [seq.flip(dims=(0,)) for seq in batch_labels]
+            reversed_labels = torch.nn.utils.rnn.pad_sequence(
+                sequences=reversed_labels,
+                batch_first=True,
+                padding_value=self.ignore_index,
+            )  # (bsz, longest sequence in batch)
+            labels = torch.flip(reversed_labels, dims=(1,))  # (bsz, longest sequence in batch)
+        else:
+            raise RuntimeError(
+                f"`{self.tokenizer.__class__.__name__}.padding_side` can only be `left` or `right`, "
+                f"but now `{self.tokenizer.padding_side}`"
+            )
+
+        # The mask is derived from token values, so any real token whose id equals
+        # `pad_token_id` is masked out as well.
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)  # `torch.BoolTensor`, (bsz, max_len)
+
+        return dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+
+
+class StatefulDistributedSampler(DistributedSampler):
+    """
+    Stateful distributed sampler for multi-stage training.
+
+    Behaves like `DistributedSampler`, except that the first `start_index` indices
+    produced for this rank are skipped on every iteration. `start_index` is 0 after
+    construction and only changes through `set_start_index`.
+    """
+
+    def __init__(
+        self,
+        dataset: DatasetType,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 0,
+        drop_last: bool = False,
+    ) -> None:
+        # All arguments are forwarded unchanged to `DistributedSampler`.
+        super().__init__(
+            dataset=dataset,
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+            seed=seed,
+            drop_last=drop_last,
+        )
+        # Number of leading per-rank indices to skip when iterating.
+        self.start_index = 0
+
+    def __iter__(self) -> Iterator:
+        # Materialize the parent's indices for this rank, then drop the part already consumed.
+        iterator = super().__iter__()
+        indices = list(iterator)
+        indices = indices[self.start_index :]
+        return iter(indices)
+
+    def __len__(self) -> int:
+        # Remaining samples for this rank after skipping `start_index` of them.
+        return self.num_samples - self.start_index
+
+    def set_start_index(self, start_index: int) -> None:
+        # The value stays in effect for every later iteration until it is set again.
+        self.start_index = start_index
+
+
+def setup_distributed_dataloader(
+    dataset: DatasetType,
+    batch_size: int = 1,
+    shuffle: bool = False,
+    seed: int = 1024,
+    drop_last: bool = False,
+    pin_memory: bool = False,
+    num_workers: int = 0,
+    collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
+    process_group: Optional[ProcessGroup] = None,
+    **kwargs,
+) -> DataLoader:
+    """
+    Setup dataloader for distributed training.
+    """
+    _kwargs = kwargs.copy()
+    process_group = process_group or _get_default_group()
+    sampler = StatefulDistributedSampler(
+        dataset=dataset,
+        num_replicas=process_group.size(),
+        rank=process_group.rank(),
+        shuffle=shuffle,
+        seed=seed,
+        drop_last=drop_last,
+    )
+
+    # Deterministic dataloader
+    def seed_worker(worker_id: int) -> None:
+        worker_seed = seed
+        np.random.seed(worker_seed)
+        torch.manual_seed(worker_seed)
+        random.seed(worker_seed)
+
+    return DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=collate_fn,
+        pin_memory=pin_memory,
+        drop_last=drop_last,
+        worker_init_fn=seed_worker,
+        **_kwargs,
+    )
--- a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Splicing multiple pre-tokenized sequence data points
+"""
+
+import random
+import warnings
+from copy import deepcopy
+from datasets import dataset_dict
+from typing import Any, Callable, Dict, Iterable, List, Union, Tuple
+
+from torch.utils.data import ConcatDataset, Dataset, IterableDataset
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+IGNORE_INDEX = -100
+
+DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+
+
+def supervised_tokenize(
+    data_point: Dict[str, str], tokenizer: LlamaTokenizer, ignore_index: int = None, max_length: int = 4096
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    A tokenization function to tokenize an original pretraining data point as following:
+        {"source": "", "target": "Beijing, the capital of the People's Republic of China, ...", "category": "geography"}
+    """
+    assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+        "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+        "add <bos> and <eos> manually later"
+    )
+    if ignore_index is None:
+        ignore_index = IGNORE_INDEX
+
+    source_text = data_point["source"]  # `str`
+    target_text = data_point["target"]  # `str`
+    is_null_source = len(source_text) == 0
+
+    source_text = tokenizer.bos_token + source_text
+    target_text += tokenizer.eos_token
+    sequence_text = source_text + target_text
+
+    tokenized = tokenizer([source_text, sequence_text])["input_ids"]
+    sequence_input_ids = tokenized[1]
+    sequence_labels = deepcopy(sequence_input_ids)
+
+    source_length = len(tokenized[0])
+    if not is_null_source:
+        sequence_labels[:source_length] = [ignore_index for _ in range(source_length)]
+
+    # sequence truncation.
+    if len(sequence_input_ids) > max_length:
+        sequence_input_ids = sequence_input_ids[:max_length]
+        sequence_labels = sequence_labels[:max_length]
+
+    return dict(
+        input_ids=sequence_input_ids,
+        labels=sequence_labels,
+        seq_length=len(sequence_input_ids),
+        seq_category=data_point["category"],
+    )
+
+
+class ClosedToConstantLengthSplicedDataset(IterableDataset):
+    """
+    Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
+    original independent (pre-tokenized) data points.
+
+    Original data points are read into a buffer until the buffer holds at least
+    `max_length * num_packed_sequences` tokens, then consecutive data points are concatenated
+    into spliced data points of at most `max_length` tokens each. A single data point longer
+    than `max_length` is truncated and emitted on its own.
+    """
+
+    def __init__(
+        self,
+        dataset: DSType,
+        tokenizer: PreTrainedTokenizer,
+        max_length: int = 4096,
+        num_packed_sequences: int = 8,
+        fetch_sequence_func: Callable[[Any], Tuple[List[int], List[int]]] = None,
+        input_ids_field: str = "input_ids",
+        labels_field: str = "labels",
+        infinite: bool = False,
+        shuffle: bool = True,
+        error_strict: bool = False,
+    ) -> None:
+        """
+        Args:
+            dataset: Source of pre-tokenized data points.
+            tokenizer: Stored on the instance; not used by the methods of this class.
+            max_length: Maximum number of tokens of one spliced data point.
+            num_packed_sequences: Multiplier of `max_length` giving the buffer's token budget.
+            fetch_sequence_func: Extracts `(input_ids, labels)` from a data point; defaults to
+                reading `input_ids_field` and `labels_field`.
+            input_ids_field: Key of the token ids, both when reading (by default) and in the output.
+            labels_field: Key of the labels, both when reading (by default) and in the output.
+            infinite: Restart from the beginning of `dataset` whenever it is exhausted.
+            shuffle: Shuffle the spliced data points produced from each buffer.
+            error_strict: Raise instead of skipping a truncated data point whose labels are all `IGNORE_INDEX`.
+        """
+        self.tokenizer = tokenizer
+        self.dataset = dataset
+        self.max_length = max_length
+        self.infinite = infinite
+        self.max_buffer_size = max_length * num_packed_sequences  # token budget per buffer, e.g., 4096 * 8
+        self.shuffle = shuffle
+
+        # Callable[[Dict[str, Any]], Tuple[List[int], List[int]]],
+        # A function that fetch sequence input_ids and labels from the original data point
+        if fetch_sequence_func is None:
+            self.fetch_sequence_func = lambda data_point: (data_point[input_ids_field], data_point[labels_field])
+        else:
+            self.fetch_sequence_func = fetch_sequence_func
+        self.input_ids_field = input_ids_field
+        self.labels_field = labels_field
+
+        self.error_strict = error_strict
+        self.current_size = 0  # `int`, current packed data size.
+
+    def __len__(self) -> int:
+        # Length of the underlying dataset, i.e. the number of original (unspliced) data points,
+        # not the number of spliced data points yielded by `__iter__`.
+        return len(self.dataset)
+
+    def __iter__(self) -> Iterable[Dict[str, List[int]]]:
+        """
+        Yield spliced data points as `{input_ids_field: List[int], labels_field: List[int]}`.
+
+        Raises:
+            ValueError: If `error_strict` is True and a data point longer than `max_length`
+                has only `IGNORE_INDEX` labels after truncation.
+        """
+        iterator = iter(self.dataset)
+        more_data_points = True
+        while more_data_points is True:
+            # Phase 1: fill the buffer until it holds at least `max_buffer_size` tokens.
+            buffer, buffer_len = [], 0
+            while True:
+                # ending condition.
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    # `Tuple[List[int], List[int]]`
+                    seq_input_ids, seq_labels = self.fetch_sequence_func(next(iterator))
+                    buffer.append({self.input_ids_field: seq_input_ids, self.labels_field: seq_labels})
+                    buffer_len += len(buffer[-1][self.input_ids_field])
+                except StopIteration:
+                    if self.infinite is True:
+                        # NOTE(review): with `infinite=True` and an empty `dataset` this loop
+                        # looks like it never terminates -- confirm callers never pass one.
+                        iterator = iter(self.dataset)
+                        warnings.warn("The dataset reached end and the iterator is reset to the start.")
+                    else:
+                        more_data_points = False
+                        break
+            # Phase 2: splice the buffered data points.
+            examples = []  # `List[Dict[str, List[int]]]`, save buffered spliced data points.
+            spliced_input_ids, spliced_labels = [], []  # `List[int]`, `List[int]`
+            for i, data_point in enumerate(buffer):
+                # TODO(2023-09-18) check errors for each unspliced tokenized data point
+                seq_input_ids = data_point[self.input_ids_field]
+                seq_labels = data_point[self.labels_field]
+                # Handle special case:
+                # If the length of an original data point (i.e., input_ids length of a data point before splicing)
+                # exceeds `max_length`, truncate it.
+                if len(seq_input_ids) > self.max_length:
+                    truncated_seq_input_ids = seq_input_ids[: self.max_length]
+                    truncated_label_ids = seq_labels[: self.max_length]
+                    # NOTE(review): compares against the module-level `IGNORE_INDEX`; data tokenized
+                    # with a different ignore value would not be detected here -- confirm.
+                    if set(truncated_label_ids) == {IGNORE_INDEX}:
+                        if self.error_strict is True:
+                            raise ValueError(
+                                f"Find an out-of-bounds length({len(seq_input_ids)}) data point "
+                                f"with all label values as {IGNORE_INDEX}."
+                            )
+                        else:
+                            warnings.warn(f"Filter an error truncated data point (labels all {IGNORE_INDEX})")
+                            continue  # Skip the current error data point.
+                    # The truncated data point is emitted alone; the splice in progress is left untouched.
+                    spliced_data_point = {
+                        self.input_ids_field: truncated_seq_input_ids,
+                        self.labels_field: truncated_label_ids,
+                    }
+                    examples.append(spliced_data_point)
+                    warnings.warn("Find a data point to be truncated.")
+                    continue
+
+                # Pre action judgment.
+                if len(spliced_input_ids) + len(seq_input_ids) > self.max_length:
+                    # Adding this data point would overflow: emit the splice in progress
+                    # and start a new one from the current data point.
+                    spliced_data_point = {
+                        self.input_ids_field: spliced_input_ids,
+                        self.labels_field: spliced_labels,
+                    }  # `Dict[str, List[int]]`
+                    # Update.
+                    spliced_input_ids, spliced_labels = [], []
+                    spliced_input_ids.extend(seq_input_ids)
+                    spliced_labels.extend(seq_labels)
+                    examples.append(spliced_data_point)
+                else:
+                    spliced_input_ids.extend(seq_input_ids)
+                    spliced_labels.extend(seq_labels)
+            # For residual spliced data point at the end of the data set
+            # NOTE(review): when the buffer filled up before the dataset ended, the splice still in
+            # progress here is neither emitted nor carried over to the next buffer, so its tokens
+            # are dropped -- confirm this is the intended "close to constant length" trade-off.
+            if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
+                examples.append(
+                    {
+                        self.input_ids_field: spliced_input_ids,
+                        self.labels_field: spliced_labels
+                    }
+                )
+            # Shuffling happens only among the spliced data points of the current buffer.
+            if self.shuffle:
+                random.shuffle(examples)
+            for spliced_data_point in examples:
+                # TODO(2023-09-18): check errors for each spliced tokenized data point.
+                self.current_size += 1
+                yield spliced_data_point
--- a/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Initialize new model with updated tokenizer by calculating the mean values from original model
+"""
+import argparse
+
+import numpy as np
+import torch
+from transformers import LlamaTokenizer, LlamaForCausalLM
+
+from colossalai.logging import get_dist_logger
+
+
+logger = get_dist_logger()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--source_model_and_tokenizer_path",
+        type=str,
+        required=True,
+        default=None,
+        help="Source path of model & tokenizer",
+    )
+    parser.add_argument("--target_tokenizer_path", type=str, required=True, default=None, help="Target tokenizer path")
+    parser.add_argument("--target_model_path", type=str, required=True, default=None, help="Target model path")
+    args = parser.parse_args()
+
+    source_tokenizer = LlamaTokenizer.from_pretrained(args.source_model_and_tokenizer_path)
+    source_tokenizer.add_bos_token = False
+    source_tokenizer.add_eos_token = False
+    if source_tokenizer.pad_token is None:
+        source_tokenizer.pad_token = source_tokenizer.unk_token
+    source_vocab = source_tokenizer.get_vocab()
+
+    target_tokenizer = LlamaTokenizer.from_pretrained(args.target_tokenizer_path)
+    target_tokenizer.add_bos_token = False
+    target_tokenizer.add_eos_token = False
+    if target_tokenizer.pad_token is None:
+        target_tokenizer.pad_token = target_tokenizer.unk_token
+    target_vocab = target_tokenizer.get_vocab()
+    target_inverted_vocab = {v: k for k, v in target_vocab.items()}
+
+    assert len(target_vocab) > len(
+        source_vocab
+    ), f"Target vocab size({len(target_vocab)}) must be greater than source vocab size({len(source_vocab)})"
+
+    gpu_device = torch.device("cuda:0")
+    cpu_device = torch.device("cpu")
+
+    source_model = LlamaForCausalLM.from_pretrained(args.source_model_and_tokenizer_path)
+    source_model.eval()
+    source_model = source_model.to(gpu_device)
+
+    source_input_embeddings = source_model.get_input_embeddings()
+    assert isinstance(source_input_embeddings, torch.nn.Embedding)
+    assert source_input_embeddings.weight.shape[0] == len(source_vocab)
+    source_input_embeddings.eval()
+
+    source_output_embeddings = source_model.get_output_embeddings()
+    assert isinstance(source_output_embeddings, torch.nn.Linear)
+    assert source_output_embeddings.bias is None
+    assert source_output_embeddings.weight.shape[0] == len(source_vocab)
+    source_output_embeddings.eval()
+
+    input_embeddings = source_input_embeddings.weight.cpu().detach().numpy()
+    output_embeddings = source_output_embeddings.weight.cpu().detach().numpy()
+    for i in range(len(source_vocab), len(target_vocab)):
+        if i % 500 == 0:
+            logger.info(f"processing {i}/{len(target_vocab)} target tokens")
+        target_token = target_inverted_vocab[i]
+        target_to_source_token_ids = torch.LongTensor(source_tokenizer([target_token])["input_ids"][0])
+        target_to_source_token_ids = target_to_source_token_ids.to(gpu_device)
+
+        target_to_source_input_embedding = (
+            source_input_embeddings.weight[target_to_source_token_ids]
+            .mean(dim=0)
+            .unsqueeze(dim=0)
+            .cpu()
+            .detach()
+            .numpy()
+        )
+        target_to_source_output_embedding = (
+            source_output_embeddings.weight[target_to_source_token_ids]
+            .mean(dim=0)
+            .unsqueeze(dim=0)
+            .cpu()
+            .detach()
+            .numpy()
+        )
+
+        input_embeddings = np.concatenate((input_embeddings, target_to_source_input_embedding), axis=0)
+        output_embeddings = np.concatenate((output_embeddings, target_to_source_output_embedding), axis=0)
+
+    source_model = source_model.to(cpu_device)
+    assert isinstance(source_model, LlamaForCausalLM)
+
+    # expand
+    source_model.resize_token_embeddings(new_num_tokens=len(target_vocab))
+    source_model.model.embed_tokens.weight.data = torch.Tensor(input_embeddings)
+    source_model.lm_head.weight.data = torch.Tensor(output_embeddings)
+
+    source_model = source_model.half()
+    source_model.save_pretrained(save_directory=args.target_model_path)
+
+
+if __name__ == "__main__":
+    main()
--- a/applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+Initialize new tokenizer for continual pre-training
+"""
+
+import argparse
+import os
+import json
+from typing import List, Union
+
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
+
+from colossalai.logging import get_dist_logger
+
+# Request the pure-Python protobuf implementation.
+# NOTE(review): this runs after `transformers` and `sentencepiece_model_pb2` were imported above;
+# presumably protobuf chooses its implementation at import time, so confirm this assignment
+# still takes effect or move it ahead of those imports.
+os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+logger = get_dist_logger()
+
+
+def expand_vocab_tokenizer(
+    source_tokenizer_dir: Union[str, os.PathLike], target_tokenizer_dir: Union[str, os.PathLike], new_tokens: List[str]
+) -> None:
+    """Expand tokenizer for continue pre-training."""
+    if os.path.exists(target_tokenizer_dir):
+        raise RuntimeError(f"Find existed directory {target_tokenizer_dir}")
+
+    source_tokenizer = LlamaTokenizer.from_pretrained(source_tokenizer_dir)
+    logger.info(source_tokenizer)
+    source_sp_processor = source_tokenizer.sp_model
+    source_spm = sp_pb2_model.ModelProto()
+    source_spm.ParseFromString(source_sp_processor.serialized_model_proto())
+
+    logger.info(f"Source tokenizer size: {len(source_sp_processor)}")
+
+    # Add new tokens to source tokenizer.
+    source_spm_tokens = set([p.piece for p in source_spm.pieces])
+    for piece in new_tokens:
+        assert isinstance(piece, str), f"Invalid token({piece}) type {type(piece)}"
+        if piece in source_spm_tokens:
+            # Skip existed token.
+            continue
+        new_p = sp_pb2_model.ModelProto().SentencePiece()
+        new_p.piece = piece
+        new_p.score = 0
+        source_spm.pieces.append(new_p)
+    logger.info(f"Expand vocab from {len(source_spm_tokens)} to {len(source_spm.pieces)}")
+
+    # Save
+    os.makedirs(target_tokenizer_dir)
+    target_tokenizer_model_path = os.path.join(target_tokenizer_dir, "tokenizer.model")
+    with open(file=target_tokenizer_model_path, mode="wb") as fp:
+        fp.write(source_spm.SerializeToString())
+
+    target_tokenizer = LlamaTokenizer(vocab_file=target_tokenizer_model_path)
+    target_tokenizer.save_pretrained(save_directory=target_tokenizer_dir)
+    logger.info(f"Successfully save expand tokenizer to {target_tokenizer_dir}")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--source_tokenizer_dir", type=str, required=True, default=None, help="Source tokenizer directory"
+    )
+    parser.add_argument(
+        "--target_tokenizer_dir", type=str, required=True, default=None, help="Target tokenizer directory"
+    )
+    parser.add_argument(
+        "--expand_tokens_file",
+        type=str,
+        required=True,
+        default=None,
+        help="Path of the file containing tokens to be extended",
+    )
+    args = parser.parse_args()
+
+    expand_tokens = []
+    with open(file=args.expand_tokens_file, mode="r", encoding="utf-8") as fp_reader:
+        for line in fp_reader:
+            item = json.loads(line)
+            # e.g., {"piece": "你好"}
+            token = item["piece"]
+            if token in expand_tokens:
+                continue
+            expand_tokens.append(token)
+    expand_tokens.sort(key=lambda t: len(t), reverse=False)
+
+    expand_vocab_tokenizer(
+        source_tokenizer_dir=args.source_tokenizer_dir,
+        target_tokenizer_dir=args.target_tokenizer_dir,
+        new_tokens=expand_tokens,
+    )
+
+
+if __name__ == "__main__":
+    main()