Merge branch 'main' of https://github.com/hpcaitech/ColossalAI

7bc5a8e3 · zhuwenwen · e6748d82 · 0f785cb1 · 7bc5a8e3 · 7bc5a8e3
Commit 7bc5a8e3 authored May 05, 2023 by zhuwenwen
20 changed files
--- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py
+++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py
+import argparse
+import logging
+import os
+import socket
+from copy import deepcopy
+from typing import Type
+
+import ray
+import torch
+from coati.experience_maker.base import Experience
+from coati.models.base import RewardModel
+from coati.models.bloom import BLOOMActor, BLOOMCritic
+from coati.models.gpt import GPTActor, GPTCritic
+from coati.models.lora import LoRAModule
+from coati.models.loss import PolicyLoss, ValueLoss
+from coati.models.opt import OPTActor, OPTCritic
+from coati.models.utils import compute_reward
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from ray.util.placement_group import placement_group
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from torch.optim import Adam
+from transformers import AutoTokenizer, BloomTokenizerFast
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+class ExperienceCompositionRefs:
+
+    def __init__(self, sequences_attention_mask_action_mask_ref: ray.ObjectRef, action_log_probs_ref: ray.ObjectRef,
+                 base_action_log_probs_ref: ray.ObjectRef, value_ref: ray.ObjectRef, r_ref: ray.ObjectRef) -> None:
+        self.sequences_attention_mask_action_mask_ref = sequences_attention_mask_action_mask_ref
+        self.action_log_probs_ref = action_log_probs_ref
+        self.base_action_log_probs_ref = base_action_log_probs_ref
+        self.value_ref = value_ref
+        self.r_ref = r_ref
+
+
+class ExperienceMaker:
+
+    def __init__(self, kl_coef) -> None:
+        self.kl_coef = kl_coef
+
+    @torch.no_grad()
+    def make_experience(self, experiment_computation_refs: ExperienceCompositionRefs):
+        sequences, attention_mask, action_mask = ray.get(
+            experiment_computation_refs.sequences_attention_mask_action_mask_ref)
+        action_log_probs = ray.get(experiment_computation_refs.action_log_probs_ref)
+        base_action_log_probs = ray.get(experiment_computation_refs.base_action_log_probs_ref)
+        r = ray.get(experiment_computation_refs.r_ref)
+        reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
+        value = ray.get(experiment_computation_refs.value_ref)
+        advantage = reward - value
+        if advantage.ndim == 1:
+            advantage = advantage.unsqueeze(-1)
+        experience = Experience(sequences, action_log_probs, value, reward, advantage, attention_mask, action_mask)
+        return experience
+
+
+class DistributedTorchRayActor:
+
+    def __init__(self, world_size, rank, local_rank, master_addr, master_port):
+        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
+                            level=logging.INFO,
+                            datefmt='%Y-%m-%d %H:%M:%S')
+        self._model = None
+        self._world_size = world_size
+        self._rank = rank
+        self._local_rank = local_rank
+        self._master_addr = master_addr if master_addr else self._get_current_node_ip()
+        self._master_port = master_port if master_port else self._get_free_port()
+        os.environ["MASTER_ADDR"] = self._master_addr
+        os.environ["MASTER_PORT"] = str(self._master_port)
+        os.environ["WORLD_SIZE"] = str(self._world_size)
+        os.environ["RANK"] = str(self._rank)
+        os.environ["LOCAL_RANK"] = str(self._local_rank)
+
+    @staticmethod
+    def _get_current_node_ip():
+        return ray._private.services.get_node_ip_address()
+
+    @staticmethod
+    def _get_free_port():
+        with socket.socket() as sock:
+            sock.bind(('', 0))
+            return sock.getsockname()[1]
+
+    def get_master_addr_port(self):
+        return self._master_addr, self._master_port
+
+
+class BasePPORole(DistributedTorchRayActor):
+
+    def add_experience_maker(self, kl_coef: float = 0.1):
+        self._experience_maker = ExperienceMaker(kl_coef)
+
+    def make_experience(self, experience_computation_ref: ExperienceCompositionRefs):
+        return self._experience_maker.make_experience(experience_computation_ref)
+
+    def _init_strategy(self, strategy: str):
+        # configure strategy
+        if strategy == 'naive':
+            self._strategy = NaiveStrategy()
+        elif strategy == 'ddp':
+            self._strategy = DDPStrategy()
+        elif strategy == 'colossalai_gemini':
+            self._strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+        elif strategy == 'colossalai_zero2':
+            self._strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+        else:
+            raise ValueError(f'Unsupported strategy "{strategy}"')
+
+    def _init_optimizer(self):
+        if isinstance(self._strategy, ColossalAIStrategy):
+            self._optimizer = HybridAdam(self._model.parameters(), lr=5e-6)
+        else:
+            self._optimizer = Adam(self._model.parameters(), lr=5e-6)
+
+    def _prepare_model_with_strategy(self, has_optimizer: bool):
+        if has_optimizer:
+            self._init_optimizer()
+            (self._model, self._optimizer) = self._strategy.prepare((self._model, self._optimizer))
+        else:
+            self._model = self._strategy.prepare(self._model)
+
+    def _load_model_from_pretrained(self, model_class: Type[LoRAModule], pretrain: str):
+        raise NotImplementedError()
+
+    def init_model_from_pretrained(self,
+                                   strategy: str,
+                                   model_class: Type[LoRAModule],
+                                   pretrain: str,
+                                   has_optimizer=False):
+        self._init_strategy(strategy)
+        self._load_model_from_pretrained(model_class, pretrain)
+        self._prepare_model_with_strategy(has_optimizer)
+
+    def eval(self):
+        self._model.eval()
+
+
+class TrainablePPORole(BasePPORole):
+
+    def _load_model_from_pretrained(self, model_class, pretrain):
+        with self._strategy.model_init_context():
+            self._model = model_class(pretrain).to(torch.cuda.current_device())
+
+    def _train(self):
+        self._model.train()
+
+    def _training_step(self, experience: Experience):
+        raise NotImplementedError()
+
+    def learn_on_experiences(self, experience_refs):
+        experiences = ray.get(experience_refs)
+        device = torch.cuda.current_device()
+        self._train()
+        for exp in experiences:
+            exp.to_device(device)
+            self._training_step(exp)
+        self.eval()
+
+
+@ray.remote(num_gpus=1)
+class RayPPOActor(TrainablePPORole):
+
+    def set_loss_function(self, eps_clip: float):
+        self._actor_loss_fn = PolicyLoss(eps_clip)
+
+    def load_tokenizer_from_pretrained(self, model_type: str, pretrained):
+        if model_type == 'gpt2':
+            self._model_tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
+            self._model_tokenizer.pad_token = self._model_tokenizer.eos_token
+        elif model_type == 'bloom':
+            self._model_tokenizer = BloomTokenizerFast.from_pretrained(pretrained)
+            self._model_tokenizer.pad_token = self._model_tokenizer.eos_token
+        elif model_type == 'opt':
+            self._model_tokenizer = AutoTokenizer.from_pretrained(pretrained)
+        else:
+            raise ValueError(f'Unsupported model "{model_type}"')
+
+        # Set tokenize function for sequence generation
+        def _text_input_tokenize_fn(texts):
+            batch = self._model_tokenizer(texts, return_tensors='pt', max_length=96, padding=True, truncation=True)
+            return {k: v.cuda() for k, v in batch.items()}
+
+        self._sample_tokenize_function = _text_input_tokenize_fn
+
+    def setup_generate_kwargs(self, generate_kwargs: dict):
+        from coati.trainer.ppo import _set_default_generate_kwargs
+        self._generate_kwargs = _set_default_generate_kwargs(self._strategy, generate_kwargs, self._model)
+        self._generate_kwargs['pad_token_id'] = self._model_tokenizer.pad_token_id
+        self._generate_kwargs['eos_token_id'] = self._model_tokenizer.eos_token_id
+
+    def load_csv_prompt_file_from_url_to_sampler(self, prompt_url):
+        import pandas as pd
+        prompts = pd.read_csv(prompt_url)['prompt']
+        self._sampler = self._strategy.setup_sampler(prompts)
+
+    def _generate(self, input_ids, **generate_kwargs):
+        return self._model.generate(input_ids, return_action_mask=True, **generate_kwargs)
+
+    def sample_prompts_and_make_sequence(self, experience_batch_size):
+        sampled_prompts = self._sampler.sample(experience_batch_size)
+        input_ids = self._sample_tokenize_function(sampled_prompts)
+        if isinstance(input_ids, dict):
+            return self._generate(**input_ids, **self._generate_kwargs)
+        else:
+            return self._generate(input_ids, **self._generate_kwargs)
+
+    @torch.no_grad()
+    def calculate_action_log_probs(self, sequence_attention_action_mask):
+        sequences, attention_mask, action_mask = sequence_attention_action_mask
+        return self._model.forward(sequences, action_mask.size(1), attention_mask)
+
+    def _training_step(self, experience):
+        num_actions = experience.action_mask.size(1)
+        action_log_probs = self._model(experience.sequences, num_actions, attention_mask=experience.attention_mask)
+        actor_loss = self._actor_loss_fn(action_log_probs,
+                                         experience.action_log_probs,
+                                         experience.advantages,
+                                         action_mask=experience.action_mask)
+        self._strategy.backward(actor_loss, self._model, self._optimizer)
+        self._strategy.optimizer_step(self._optimizer)
+        self._optimizer.zero_grad()
+        logging.info("actor_loss: {}".format(actor_loss))
+
+    def save_checkpoint(self, save_path, should_save_optimizer: bool):
+        if self._rank == 0:
+            # save model checkpoint only on rank 0
+            self._strategy.save_model(self._model, save_path, only_rank0=True)
+        # save optimizer checkpoint on all ranks
+        if should_save_optimizer:
+            self._strategy.save_optimizer(self._optimizer,
+                                          'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
+                                          only_rank0=False)
+
+    def generate_answer(self, prompt, max_length=30, num_return_sequences=5):
+        encoded_input = self._model_tokenizer(prompt, return_tensors='pt')
+        input_ids = {k: v.cuda() for k, v in encoded_input.items()}
+        sequence, _ = self._model.generate(**input_ids,
+                                           max_length=max_length,
+                                           return_action_mask=False,
+                                           num_return_sequences=num_return_sequences)
+        token_list = list(sequence.data[0])
+        output = " ".join([self._model_tokenizer.decode(token) for token in token_list])
+        return output
+
+
+@ray.remote(num_gpus=1)
+class RayPPOCritic(TrainablePPORole):
+
+    def set_loss_function(self, value_clip: float):
+        self._critic_loss_fn = ValueLoss(value_clip)
+
+    def _training_step(self, experience):
+        values = self._model(experience.sequences,
+                             action_mask=experience.action_mask,
+                             attention_mask=experience.attention_mask)
+        critic_loss = self._critic_loss_fn(values,
+                                           experience.values,
+                                           experience.reward,
+                                           action_mask=experience.action_mask)
+        self._strategy.backward(critic_loss, self._model, self._optimizer)
+        self._strategy.optimizer_step(self._optimizer)
+        self._optimizer.zero_grad()
+        logging.info("critic_loss: {}".format(critic_loss))
+
+    @torch.no_grad()
+    def calculate_value(self, sequence_attention_action_mask):
+        sequences, attention_mask, action_mask = sequence_attention_action_mask
+        return self._model(sequences, action_mask, attention_mask)
+
+
+@ray.remote(num_gpus=1)
+class RayPPORewardModel(BasePPORole):
+
+    def _load_model_from_pretrained(self, model_class, pretrain):
+        with self._strategy.model_init_context():
+            critic = model_class(pretrained=pretrain).to(torch.cuda.current_device())
+            self._model = RewardModel(deepcopy(critic.model),
+                                      deepcopy(critic.value_head)).to(torch.cuda.current_device())
+
+    @torch.no_grad()
+    def calculate_r(self, sequence_attention_action_mask):
+        sequences, attention_mask, _ = sequence_attention_action_mask
+        return self._model(sequences, attention_mask)
+
+
+@ray.remote(num_gpus=1)
+class RayPPOInitialModel(BasePPORole):
+
+    def _load_model_from_pretrained(self, model_class, pretrain):
+        with self._strategy.model_init_context():
+            self._model = model_class(pretrain).to(torch.cuda.current_device())
+
+    @torch.no_grad()
+    def calculate_base_action_log_probs(self, sequence_attention_action_mask):
+        sequences, attention_mask, action_mask = sequence_attention_action_mask
+        return self._model(sequences, action_mask.size(1), attention_mask)
+
+
+class PPORayActorGroup:
+    """
+        A group of ray actors
+        Functions start with 'async' should return list of object refs
+    """
+
+    def __init__(self, num_nodes, num_gpus_per_node, ray_actor_type: Type[BasePPORole]) -> None:
+        self._num_nodes = num_nodes
+        self._num_gpus_per_node = num_gpus_per_node
+        self.ray_actor_type = ray_actor_type
+        self._initiate_actors()
+
+    def _initiate_actors(self):
+        world_size = self._num_nodes * self._num_gpus_per_node
+        # Use placement group to lock resources for models of same type
+        pg = None
+        if self._num_gpus_per_node > 1:
+            bundles = [{"GPU": self._num_gpus_per_node, "CPU": self._num_gpus_per_node} for _ in range(self._num_nodes)]
+            pg = placement_group(bundles, strategy="STRICT_SPREAD")
+            ray.get(pg.ready())
+        if pg:
+            master_actor = self.ray_actor_type.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=pg, placement_group_bundle_index=0)).remote(world_size, 0, 0, None, None)
+        else:
+            master_actor = self.ray_actor_type.options(num_gpus=1).remote(world_size, 0, 0, None, None)
+        self._actor_handlers = [master_actor]
+
+        # Create worker actors
+        if world_size > 1:
+            master_addr, master_port = ray.get(master_actor.get_master_addr_port.remote())
+            for rank in range(1, world_size):
+                local_rank = rank % self._num_gpus_per_node
+                if pg:
+                    worker_actor = self.ray_actor_type.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
+                        placement_group=pg, placement_group_bundle_index=rank // self._num_gpus_per_node)).remote(
+                            world_size, rank, local_rank, master_addr, master_port)
+                else:
+                    worker_actor = self.ray_actor_type.options(num_gpus=1).remote(world_size, rank, local_rank,
+                                                                                  master_addr, master_port)
+                self._actor_handlers.append(worker_actor)
+
+    def async_init_model_from_pretrained(self, strategy: str, model_class: Type[LoRAModule], pretrain: str,
+                                         has_optimizer: bool):
+        return [
+            actor.init_model_from_pretrained.remote(strategy, model_class, pretrain, has_optimizer)
+            for actor in self._actor_handlers
+        ]
+
+
+class TrainableModelRayActorGroup(PPORayActorGroup):
+
+    def async_learn_on_experiences(self, experience_refs):
+        num_actors = len(self._actor_handlers)
+        learn_result_refs = []
+        for i in range(num_actors):
+            exp_refs_batch = experience_refs[i::num_actors]
+            learn_result_refs.append(self._actor_handlers[i].learn_on_experiences.remote(exp_refs_batch))
+        return learn_result_refs
+
+
+class PPOActorRayActorGroup(TrainableModelRayActorGroup):
+
+    def __init__(self, num_nodes, num_gpus_per_node) -> None:
+        super().__init__(num_nodes, num_gpus_per_node, RayPPOActor)
+
+    def async_prepare_for_sequence_generation(self, model: str, pretrain: str, generation_kwargs: dict):
+        refs = []
+        for actor in self._actor_handlers:
+            refs.append(actor.load_tokenizer_from_pretrained.remote(model, pretrain))
+            refs.append(actor.setup_generate_kwargs.remote(generation_kwargs))
+        return refs
+
+    def load_csv_prompt_file_from_url_to_sampler(self, csv_url):
+        ray.get([actor.load_csv_prompt_file_from_url_to_sampler.remote(csv_url) for actor in self._actor_handlers])
+
+    def async_sample_prompts_and_make_sequence(self, experience_batch_size):
+        return [actor.sample_prompts_and_make_sequence.remote(experience_batch_size) for actor in self._actor_handlers]
+
+    def async_calculate_action_log_probs(self, sequences_attention_mask_action_mask_refs):
+        num_actors = len(self._actor_handlers)
+        action_log_probs_refs = []
+        for i in range(len(sequences_attention_mask_action_mask_refs)):
+            action_log_probs_ref = self._actor_handlers[i % num_actors].calculate_action_log_probs.remote(
+                sequences_attention_mask_action_mask_refs[i])
+            action_log_probs_refs.append(action_log_probs_ref)
+        return action_log_probs_refs
+
+    def set_loss_function(self, eps_clip: float = 0.2):
+        ray.get([actor.set_loss_function.remote(eps_clip) for actor in self._actor_handlers])
+
+    def save_checkpoint(self, save_path, should_save_optimizer):
+        ray.get([actor.save_checkpoint.remote(save_path, should_save_optimizer) for actor in self._actor_handlers])
+
+
+class PPOCriticRayActorGroup(TrainableModelRayActorGroup):
+
+    def __init__(self, num_nodes, num_gpus_per_node) -> None:
+        super().__init__(num_nodes, num_gpus_per_node, RayPPOCritic)
+
+    def async_calculate_value(self, sequences_attention_mask_action_mask_refs):
+        num_actors = len(self._actor_handlers)
+        value_refs = []
+        for i in range(len(sequences_attention_mask_action_mask_refs)):
+            value_ref = self._actor_handlers[i % num_actors].calculate_value.remote(
+                sequences_attention_mask_action_mask_refs[i])
+            value_refs.append(value_ref)
+        return value_refs
+
+    def set_loss_function(self, value_clip: float = 0.4):
+        ray.get([actor.set_loss_function.remote(value_clip) for actor in self._actor_handlers])
+
+
+class PPOInitialRayActorGroup(PPORayActorGroup):
+
+    def __init__(self, num_nodes, num_gpus_per_node) -> None:
+        super().__init__(num_nodes, num_gpus_per_node, RayPPOInitialModel)
+
+    def async_calculate_base_action_log_probs(self, sequences_attention_mask_action_mask_refs):
+        num_actors = len(self._actor_handlers)
+        base_action_log_probs_refs = []
+        for i in range(len(sequences_attention_mask_action_mask_refs)):
+            base_action_log_probs_ref = self._actor_handlers[i % num_actors].calculate_base_action_log_probs.remote(
+                sequences_attention_mask_action_mask_refs[i])
+            base_action_log_probs_refs.append(base_action_log_probs_ref)
+        return base_action_log_probs_refs
+
+
+class PPORewardRayActorGroup(PPORayActorGroup):
+
+    def __init__(self, num_nodes, num_gpus_per_node) -> None:
+        super().__init__(num_nodes, num_gpus_per_node, RayPPORewardModel)
+
+    def async_calculate_r(self, sequences_attention_mask_action_mask_refs):
+        num_actors = len(self._actor_handlers)
+        r_refs = []
+        for i in range(len(sequences_attention_mask_action_mask_refs)):
+            r_ref = self._actor_handlers[i % num_actors].calculate_r.remote(
+                sequences_attention_mask_action_mask_refs[i])
+            r_refs.append(r_ref)
+        return r_refs
+
+
+def main(args):
+    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
+                        level=logging.INFO,
+                        datefmt='%Y-%m-%d %H:%M:%S')
+    if args.model == 'gpt2':
+        actor_model_class, critic_model_class = GPTActor, GPTCritic
+    elif args.model == 'bloom':
+        actor_model_class, critic_model_class = BLOOMActor, BLOOMCritic
+    elif args.model == 'opt':
+        actor_model_class, critic_model_class = OPTActor, OPTCritic
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    logging.info("Start creating actors")
+    # Initialize 4 models (actor, critic, initial_model and reward_model)
+    actor_group = PPOActorRayActorGroup(num_nodes=args.num_actor_nodes, num_gpus_per_node=args.num_gpus_per_node)
+    critic_group = PPOCriticRayActorGroup(num_nodes=args.num_critic_nodes, num_gpus_per_node=args.num_gpus_per_node)
+    initial_group = PPOInitialRayActorGroup(num_nodes=args.num_initial_nodes, num_gpus_per_node=args.num_gpus_per_node)
+    reward_group = PPORewardRayActorGroup(num_nodes=args.num_reward_nodes, num_gpus_per_node=args.num_gpus_per_node)
+    logging.info("Actors created")
+
+    # Prepare model for training
+    generate_kwargs = {'max_length': 128, 'do_sample': True, 'temperature': 1.0, 'top_k': 50}
+    ray.get(
+        actor_group.async_init_model_from_pretrained(args.strategy, actor_model_class, args.pretrain, True) +
+        critic_group.async_init_model_from_pretrained(args.strategy, critic_model_class, args.pretrain, True) +
+        initial_group.async_init_model_from_pretrained(args.strategy, actor_model_class, args.pretrain, False) +
+        reward_group.async_init_model_from_pretrained(args.strategy, critic_model_class, args.pretrain, False) +
+        actor_group.async_prepare_for_sequence_generation(args.model, args.pretrain, generate_kwargs))
+    logging.info("Models prepared for training")
+
+    # Prepare models for training
+    actor_group.load_csv_prompt_file_from_url_to_sampler(args.prompt_csv_url)
+    actor_group.set_loss_function()
+    critic_group.set_loss_function()
+    # Training parameter
+    num_episodes = args.num_episodes
+    max_timesteps = args.max_timesteps
+    update_timesteps = args.update_timesteps
+    experience_batch_size = args.experience_batch_size
+    # Start training
+    logging.info("Training start")
+    # Set all models to eval and add experience maker
+    all_ray_actors = actor_group._actor_handlers + critic_group._actor_handlers + \
+        initial_group._actor_handlers + reward_group._actor_handlers
+    num_ray_actors = len(all_ray_actors)
+    ray.get([ray_actor.eval.remote() for ray_actor in all_ray_actors])
+    ray.get([ray_actor.add_experience_maker.remote() for ray_actor in all_ray_actors])
+    # Used as a queue to coordinate experience making
+    experience_composition_refs = []
+    time = 0
+    for episode in range(num_episodes):
+        logging.info("episode {} started".format(episode))
+        for _ in range(max_timesteps):
+            time += 1
+            # Experience queueing stage
+            sequences_attention_mask_action_mask_refs = actor_group.async_sample_prompts_and_make_sequence(
+                experience_batch_size)
+            base_action_log_probs_refs = initial_group.async_calculate_base_action_log_probs(
+                sequences_attention_mask_action_mask_refs)
+            values_refs = critic_group.async_calculate_value(sequences_attention_mask_action_mask_refs)
+            r_refs = reward_group.async_calculate_r(sequences_attention_mask_action_mask_refs)
+            action_log_probs_refs = actor_group.async_calculate_action_log_probs(
+                sequences_attention_mask_action_mask_refs)
+            experience_composition_refs.extend([
+                ExperienceCompositionRefs(sequences_attention_mask_action_mask_refs[i], action_log_probs_refs[i],
+                                          base_action_log_probs_refs[i], values_refs[i], r_refs[i])
+                for i in range(len(sequences_attention_mask_action_mask_refs))
+            ])
+            # Learning stage
+            if time % update_timesteps == 0:
+                experience_refs = []
+                # calculate experiences
+                for i, experience_composition_ref in enumerate(experience_composition_refs):
+                    exp_composition_ref = experience_composition_ref
+                    selected_ray_actor = all_ray_actors[i % num_ray_actors]
+                    experience_refs.append(selected_ray_actor.make_experience.remote(exp_composition_ref))
+                # backward
+                ray.get(
+                    actor_group.async_learn_on_experiences(experience_refs) +
+                    critic_group.async_learn_on_experiences(experience_refs))
+                # clear refs queue
+                experience_composition_refs.clear()
+    logging.info("Training finished")
+    # Save checkpoint
+    actor_group.save_checkpoint(args.save_path, args.need_optim_ckpt)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--prompt_csv_url', type=str)
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='naive')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
+    parser.add_argument('--pretrain', type=str, default='gpt2')
+    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
+    parser.add_argument('--num_episodes', type=int, default=10)
+    parser.add_argument('--max_timesteps', type=int, default=10)
+    parser.add_argument('--update_timesteps', type=int, default=10)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--num_actor_nodes', type=int, help='num of nodes to use to host actor model', default=1)
+    parser.add_argument('--num_critic_nodes', type=int, help='num of nodes to use to host critic model', default=1)
+    parser.add_argument('--num_initial_nodes', type=int, help='num of nodes to use to host initial model', default=1)
+    parser.add_argument('--num_reward_nodes', type=int, help='num of nodes to use to host reward model', default=1)
+    parser.add_argument('--num_gpus_per_node', type=int, help='num of gpus on a ray node', default=1)
+    args = parser.parse_args()
+    ray.init()
+    main(args)
--- a/applications/Chat/examples/example_data_reformat.py
+++ b/applications/Chat/examples/example_data_reformat.py
+jsonl_file = 'seed_prompts_xx.jsonl'  # seed_prompts_en.jsonl or seed_prompts_ch.json from InstructionWild
+reformat_file = 'prompts_xx.jsonl'  # reformat jsonl file used as Prompt dataset in Stage3
+
+data = ''
+with open(jsonl_file, 'r', encoding="utf-8") as f1:
+    for jsonstr in f1.readlines():
+        jsonstr = '\t' + jsonstr.strip('\n') + ',\n'
+        data = data + jsonstr
+    data = '[\n' + data + ']'
+
+with open(reformat_file, 'w') as f2:
+    f2.write(data)
\ No newline at end of file
--- a/applications/Chat/examples/inference.py
+++ b/applications/Chat/examples/inference.py
+import argparse
+
+import torch
+from coati.models.bloom import BLOOMActor
+from coati.models.gpt import GPTActor
+from coati.models.opt import OPTActor
+from coati.models.roberta import RoBERTaActor
+from transformers import AutoTokenizer, RobertaTokenizer
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+
+def eval(args):
+    # configure model
+    if args.model == 'gpt2':
+        actor = GPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    elif args.model == 'bloom':
+        actor = BLOOMActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    elif args.model == 'opt':
+        actor = OPTActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    elif args.model == 'roberta':
+        actor = RoBERTaActor(pretrained=args.pretrain).to(torch.cuda.current_device())
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    state_dict = torch.load(args.model_path)
+    actor.model.load_state_dict(state_dict)
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'bloom':
+        tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
+    elif args.model == 'roberta':
+        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    actor.eval()
+    input = args.input
+    input_ids = tokenizer.encode(input, return_tensors='pt').to(torch.cuda.current_device())
+    outputs = actor.generate(input_ids,
+                             max_length=args.max_length,
+                             do_sample=True,
+                             top_k=50,
+                             top_p=0.95,
+                             num_return_sequences=1)
+    output = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
+    print(output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'roberta'])
+    # We suggest to use the pretrained model from HuggingFace, use pretrain to configure model
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--model_path', type=str, default=None)
+    parser.add_argument('--input', type=str, default='Question: How are you ? Answer:')
+    parser.add_argument('--max_length', type=int, default=100)
+    args = parser.parse_args()
+    eval(args)
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
+pandas>=1.4.1
+sentencepiece
--- a/applications/Chat/examples/test_ci.sh
+++ b/applications/Chat/examples/test_ci.sh
+#!/usr/bin/env bash
+
+set -xue
+
+if [ -z "$SFT_DATASET" ]; then
+    echo "Please set \$SFT_DATASET to the path to sft dataset."
+    exit 1
+fi
+
+if [ -z "$PROMPT_PATH" ]; then
+    echo "Please set \$PROMPT_PATH to the path to prompts csv."
+    exit 1
+fi
+
+if [ -z "$PRETRAIN_DATASET" ]; then
+    echo "Please set \$PRETRAIN_DATASET to the path to alpaca data."
+    exit 1
+fi
+
+BASE=$(realpath $(dirname $0))
+
+export OMP_NUM_THREADS=8
+
+# install requirements
+pip install -r ${BASE}/requirements.txt
+
+wandb init -m offline
+
+# train sft
+# Each run writes to ${BASE}/output, which is removed afterwards so the next run
+# starts clean. Path expansions are quoted to survive whitespace in the paths.
+torchrun --standalone --nproc_per_node=4 "${BASE}/train_sft.py" --pretrain 'bigscience/bloom-560m' \
+        --model 'bloom' --strategy colossalai_zero2 --lora_rank 4 \
+        --dataset "$SFT_DATASET" --max_datasets_size 512 --max_epochs 1 \
+        --save_path "${BASE}/output"
+rm -rf "${BASE}/output"
+
+torchrun --standalone --nproc_per_node=4 "${BASE}/train_sft.py" --pretrain 'gpt2' \
+        --model 'gpt2' --strategy colossalai_zero2 \
+        --dataset "$SFT_DATASET" --max_datasets_size 512 --max_epochs 1 \
+        --save_path "${BASE}/output"
+rm -rf "${BASE}/output"
+
+torchrun --standalone --nproc_per_node=4 "${BASE}/train_sft.py" --pretrain 'facebook/opt-350m' \
+        --model 'opt' --strategy colossalai_zero2 --lora_rank 4 \
+        --dataset "$SFT_DATASET" --max_datasets_size 512 --max_epochs 1 \
+        --save_path "${BASE}/output"
+rm -rf "${BASE}/output"
+
+torchrun --standalone --nproc_per_node=4 "${BASE}/train_sft.py" --pretrain 'gpt2' \
+        --model 'gpt2' --strategy ddp --lora_rank 4 \
+        --dataset "$SFT_DATASET" --max_datasets_size 512 --max_epochs 1 \
+        --save_path "${BASE}/output"
+
+# Disabled: SFT run with the naive strategy.
+#torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \
+#        --model 'opt' --strategy naive \
+#        --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \
+#        --save_path ${BASE}/output
+
+rm -rf "${BASE}/output"
+
+# train rm
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                            --pretrain 'facebook/opt-350m' --model 'opt' \
+                            --strategy colossalai_zero2 --loss_fn 'log_sig'\
+                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+                            --test True --lora_rank 0 \
+                            --save_path ${BASE}/rm_ckpt_opt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                            --pretrain 'gpt2' --model 'gpt2' \
+                            --strategy colossalai_zero2 --loss_fn 'log_exp' \
+                            --dataset 'Dahoas/rm-static' \
+                            --test True  --lora_rank 0 \
+                            --save_path ${BASE}/rm_ckpt_gpt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                            --pretrain 'gpt2' --model 'gpt2' \
+                            --strategy ddp --loss_fn 'log_exp' \
+                            --dataset 'Dahoas/rm-static' \
+                            --test True --lora_rank 4 \
+                            --save_path ${BASE}/rm_ckpt.pt
+rm -rf ${BASE}/rm_ckpt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                            --pretrain 'bigscience/bloom-560m' --model 'bloom' \
+                            --strategy colossalai_zero2 --loss_fn 'log_sig' \
+                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+                            --test True --lora_rank 4 \
+                            --save_path ${BASE}/rm_ckpt.pt
+rm -rf ${BASE}/rm_ckpt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                            --pretrain 'microsoft/deberta-v3-large' --model 'deberta' \
+                            --strategy colossalai_zero2 --loss_fn 'log_sig' \
+                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \
+                            --test True --lora_rank 4 \
+                            --save_path ${BASE}/rm_ckpt.pt
+rm -rf ${BASE}/rm_ckpt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \
+                            --pretrain 'roberta-base' --model 'roberta' \
+                            --strategy colossalai_zero2 --loss_fn 'log_exp'\
+                            --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base'\
+                            --test True --lora_rank 4 \
+                            --save_path ${BASE}/rm_ckpt.pt
+
+rm -rf ${BASE}/rm_ckpt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+        --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
+        --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+        --pretrain 'facebook/opt-350m' --model opt \
+        --rm_pretrain 'facebook/opt-350m' \
+        --rm_path ${BASE}/rm_ckpt_opt.pt \
+        --save_path ${BASE}/actor_checkpoint_prompts.pt
+rm -rf ${BASE}/rm_ckpt_opt.pt
+
+torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+         --strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
+         --update_timesteps 2 --max_epochs 1 --train_batch_size 2 \
+         --pretrain 'gpt2' --model gpt2 \
+         --rm_pretrain 'gpt2' \
+         --rm_path ${BASE}/rm_ckpt_gpt.pt \
+         --save_path ${BASE}/actor_checkpoint_prompts.pt
+rm -rf ${BASE}/rm_ckpt_gpt.pt
+
+rm -rf ${BASE}/actor_checkpoint_prompts.pt
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
+import argparse
+
+import pandas as pd
+import torch
+import torch.distributed as dist
+from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
+from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
+from coati.models.gpt import GPTRM, GPTActor, GPTCritic
+from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
+from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM
+from coati.trainer import PPOTrainer
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.utils import prepare_llama_tokenizer_and_embedding
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+def main(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    if args.rm_path is not None:
+        state_dict = torch.load(args.rm_path, map_location='cpu')
+
+    # configure model
+    if args.model == 'gpt2':
+        initial_model = GPTActor(pretrained=args.pretrain)
+    elif args.model == 'bloom':
+        initial_model = BLOOMActor(pretrained=args.pretrain)
+    elif args.model == 'opt':
+        initial_model = OPTActor(pretrained=args.pretrain)
+    elif args.model == 'llama':
+        initial_model = LlamaActor(pretrained=args.pretrain)
+    elif args.model == 'roberta':
+        initial_model = RoBERTaActor(pretrained=args.pretrain)
+    else:
+        raise ValueError(f'Unsupported actor model "{args.model}"')
+
+    if args.rm_model == None:
+        rm_model_name = args.model
+    else:
+        rm_model_name = args.rm_model
+
+    if rm_model_name == 'gpt2':
+        reward_model = GPTRM(pretrained=args.rm_pretrain)
+    elif rm_model_name == 'bloom':
+        reward_model = BLOOMRM(pretrained=args.rm_pretrain)
+    elif rm_model_name == 'opt':
+        reward_model = OPTRM(pretrained=args.rm_pretrain)
+    elif rm_model_name == 'llama':
+        reward_model = LlamaRM(pretrained=args.rm_pretrain)
+    elif rm_model_name == 'roberta':
+        reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
+    else:
+        raise ValueError(f'Unsupported reward model "{rm_model_name}"')
+
+    if args.rm_path is not None:
+        reward_model.load_state_dict(state_dict)
+
+    initial_model.to(torch.float16).to(torch.cuda.current_device())
+    reward_model.to(torch.float16).to(torch.cuda.current_device())
+
+    with strategy.model_init_context():
+        if args.model == 'gpt2':
+            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+        elif args.model == 'bloom':
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+        elif args.model == 'opt':
+            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+        elif args.model == 'llama':
+            actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+        elif args.model == 'roberta':
+            actor = RoBERTaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+        else:
+            raise ValueError(f'Unsupported actor model "{args.model}"')
+
+        if rm_model_name == 'gpt2':
+            critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
+        elif rm_model_name == 'bloom':
+            critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
+        elif rm_model_name == 'opt':
+            critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
+        elif rm_model_name == 'llama':
+            critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
+        elif rm_model_name == 'roberta':
+            critic = RoBERTaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
+        else:
+            raise ValueError(f'Unsupported reward model "{rm_model_name}"')
+
+        if args.rm_path is not None:
+            critic.load_state_dict(state_dict)
+            del state_dict
+
+    if args.strategy != 'colossalai_gemini':
+        critic.to(torch.float16).to(torch.cuda.current_device())
+        actor.to(torch.float16).to(torch.cuda.current_device())
+
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        actor_optim = HybridAdam(actor.parameters(), lr=1e-7)
+        critic_optim = HybridAdam(critic.parameters(), lr=1e-7)
+    else:
+        actor_optim = Adam(actor.parameters(), lr=1e-7)
+        critic_optim = Adam(critic.parameters(), lr=1e-7)
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    elif args.model == 'llama':
+        tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
+        tokenizer.eos_token = '<\s>'
+    elif args.model == 'roberta':
+        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+
+    if args.model == 'llama':
+        tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, actor)
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+
+    prompt_dataset = PromptDataset(tokenizer=tokenizer, data_path=args.prompt_dataset, max_datasets_size=16384)
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
+    else:
+        prompt_sampler = None
+    prompt_dataloader = DataLoader(prompt_dataset,
+                                   shuffle=(prompt_sampler is None),
+                                   sampler=prompt_sampler,
+                                   batch_size=args.experience_batch_size)
+
+    pretrain_dataset = SupervisedDataset(tokenizer=tokenizer,
+                                         data_path=args.pretrain_dataset,
+                                         max_datasets_size=16384,
+                                         max_length=args.max_input_len)
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
+    else:
+        pretrain_sampler = None
+    pretrain_dataloader = DataLoader(pretrain_dataset,
+                                     shuffle=(pretrain_sampler is None),
+                                     sampler=pretrain_sampler,
+                                     batch_size=args.ptx_batch_size,
+                                     collate_fn=data_collator)
+
+    (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
+
+    # configure trainer
+    trainer = PPOTrainer(
+        strategy,
+        actor,
+        critic,
+        reward_model,
+        initial_model,
+        actor_optim,
+        critic_optim,
+        kl_coef=args.kl_coef,
+        ptx_coef=args.ptx_coef,
+        max_epochs=args.max_epochs,
+        train_batch_size=args.train_batch_size,
+        max_length=args.max_seq_len,
+        use_cache=True,
+        do_sample=True,
+        temperature=1.0,
+        top_k=50,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+
+    trainer.fit(prompt_dataloader=prompt_dataloader,
+                pretrain_dataloader=pretrain_dataloader,
+                num_episodes=args.num_episodes,
+                max_timesteps=args.max_timesteps,
+                update_timesteps=args.update_timesteps)
+
+    # save model checkpoint after fitting
+    strategy.save_model(actor, args.save_path, only_rank0=True)
+    # save optimizer checkpoint on all ranks
+    if args.need_optim_ckpt:
+        strategy.save_optimizer(actor_optim,
+                                'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
+                                only_rank0=False)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--prompt_dataset', type=str, default=None, help='path to the prompt dataset')
+    parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset')
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='colossalai_zero2',
+                        help='strategy to use')
+    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama', 'roberta'])
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--rm_model', default=None, choices=['gpt2', 'bloom', 'opt', 'llama', 'roberta'])
+    parser.add_argument('--rm_path', type=str, default=None)
+    parser.add_argument('--rm_pretrain', type=str, default=None)
+    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts')
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
+    parser.add_argument('--num_episodes', type=int, default=10)
+    parser.add_argument('--max_timesteps', type=int, default=10)
+    parser.add_argument('--update_timesteps', type=int, default=10)
+    parser.add_argument('--max_epochs', type=int, default=5)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--ptx_batch_size', type=int, default=1)
+    parser.add_argument('--experience_batch_size', type=int, default=8)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument('--kl_coef', type=float, default=0.1)
+    parser.add_argument('--ptx_coef', type=float, default=0.9)
+    parser.add_argument('--max_input_len', type=int, default=96)
+    parser.add_argument('--max_seq_len', type=int, default=128)
+    args = parser.parse_args()
+    main(args)
--- a/applications/Chat/examples/train_prompts.sh
+++ b/applications/Chat/examples/train_prompts.sh
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
+
+# torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2
+
+torchrun --standalone --nproc_per_node=2 train_prompts.py --prompt_dataset /path/to/data.json --strategy colossalai_zero2
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
+import argparse
+from random import randint
+
+import loralib as lora
+import torch
+import torch.distributed as dist
+from coati.dataset import HhRlhfDataset, RmStaticDataset
+from coati.models import LogExpLoss, LogSigLoss
+from coati.models.base import RewardModel
+from coati.models.bloom import BLOOMRM
+from coati.models.deberta import DebertaRM
+from coati.models.gpt import GPTRM
+from coati.models.llama import LlamaRM
+from coati.models.opt import OPTRM
+from coati.models.roberta import RoBERTaRM
+from coati.trainer import RewardModelTrainer
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.utils import prepare_llama_tokenizer_and_embedding
+from datasets import load_dataset
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from transformers import AutoTokenizer, BloomTokenizerFast, DebertaV2Tokenizer, LlamaTokenizer, RobertaTokenizer
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+from colossalai.nn.optimizer import HybridAdam
+
+
+def train(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    # configure model
+    with strategy.model_init_context():
+        if args.model == 'bloom':
+            model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        elif args.model == 'opt':
+            model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        elif args.model == 'gpt2':
+            model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        elif args.model == 'deberta':
+            model = DebertaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        elif args.model == 'llama':
+            model = LlamaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        elif args.model == 'roberta':
+            model = RoBERTaRM(pretrained=args.pretrain, lora_rank=args.lora_rank).to(torch.cuda.current_device())
+        else:
+            raise ValueError(f'Unsupported model "{args.model}"')
+
+        if args.model_path is not None:
+            state_dict = torch.load(args.model_path)
+            model.load_state_dict(state_dict)
+
+    model = model.to(torch.float16)
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    elif args.model == 'deberta':
+        tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
+    elif args.model == 'llama':
+        tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
+    elif args.model == 'roberta':
+        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+    max_len = args.max_len
+
+    if args.model == 'llama':
+        tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, model)
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        optim = HybridAdam(model.parameters(), lr=5e-6)
+    else:
+        optim = Adam(model.parameters(), lr=5e-6)
+
+    # configure loss function
+    if args.loss_fn == 'log_sig':
+        loss_fn = LogSigLoss()
+    elif args.loss_fn == 'log_exp':
+        loss_fn = LogExpLoss()
+    else:
+        raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
+
+    # prepare for data and dataset
+    if args.subset is not None:
+        data = load_dataset(args.dataset, data_dir=args.subset)
+    else:
+        data = load_dataset(args.dataset)
+
+    if args.test:
+        train_data = data['train'].select(range(100))
+        eval_data = data['test'].select(range(10))
+    else:
+        train_data = data['train']
+        eval_data = data['test']
+    valid_data = data['test'].select((randint(0, len(eval_data) - 1) for _ in range(len(eval_data) // 5)))
+
+    if args.dataset == 'Dahoas/rm-static':
+        train_dataset = RmStaticDataset(train_data, tokenizer, max_len)
+        valid_dataset = RmStaticDataset(valid_data, tokenizer, max_len)
+        eval_dataset = RmStaticDataset(eval_data, tokenizer, max_len)
+    elif args.dataset == 'Anthropic/hh-rlhf':
+        train_dataset = HhRlhfDataset(train_data, tokenizer, max_len)
+        valid_dataset = HhRlhfDataset(valid_data, tokenizer, max_len)
+        eval_dataset = HhRlhfDataset(eval_data, tokenizer, max_len)
+    else:
+        raise ValueError(f'Unsupported dataset "{args.dataset}"')
+
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        train_sampler = DistributedSampler(train_dataset,
+                                           shuffle=True,
+                                           seed=42,
+                                           drop_last=True,
+                                           rank=dist.get_rank(),
+                                           num_replicas=dist.get_world_size())
+        valid_sampler = DistributedSampler(valid_dataset,
+                                           shuffle=True,
+                                           seed=42,
+                                           drop_last=True,
+                                           rank=dist.get_rank(),
+                                           num_replicas=dist.get_world_size())
+        eval_sampler = DistributedSampler(eval_dataset,
+                                          shuffle=True,
+                                          seed=42,
+                                          drop_last=True,
+                                          rank=dist.get_rank(),
+                                          num_replicas=dist.get_world_size())
+    else:
+        train_sampler = None
+        valid_sampler = None
+        eval_sampler = None
+
+    train_dataloader = DataLoader(train_dataset,
+                                  shuffle=(train_sampler is None),
+                                  sampler=train_sampler,
+                                  batch_size=args.batch_size,
+                                  pin_memory=True)
+
+    valid_dataloader = DataLoader(valid_dataset,
+                                  shuffle=(valid_sampler is None),
+                                  sampler=valid_sampler,
+                                  batch_size=args.batch_size,
+                                  pin_memory=True)
+
+    eval_dataloader = DataLoader(eval_dataset,
+                                 shuffle=(eval_sampler is None),
+                                 sampler=eval_sampler,
+                                 batch_size=args.batch_size,
+                                 pin_memory=True)
+
+    (model, optim) = strategy.prepare((model, optim))
+    trainer = RewardModelTrainer(model=model,
+                                 strategy=strategy,
+                                 optim=optim,
+                                 loss_fn=loss_fn,
+                                 train_dataloader=train_dataloader,
+                                 valid_dataloader=valid_dataloader,
+                                 eval_dataloader=eval_dataloader,
+                                 max_epochs=args.max_epochs)
+
+    trainer.fit()
+    # save model checkpoint after fitting on only rank0
+    strategy.save_model(model, args.save_path, only_rank0=True)
+    # save optimizer checkpoint on all ranks
+    if args.need_optim_ckpt:
+        strategy.save_optimizer(trainer.optimizer,
+                                'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()),
+                                only_rank0=False)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
+                        default='colossalai_zero2')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'deberta', 'llama', 'roberta'], default='bloom')
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--model_path', type=str, default=None)
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
+    parser.add_argument('--dataset',
+                        type=str,
+                        choices=['Anthropic/hh-rlhf', 'Dahoas/rm-static'],
+                        default='Dahoas/rm-static')
+    parser.add_argument('--subset', type=str, default=None)
+    parser.add_argument('--save_path', type=str, default='rm_ckpt')
+    parser.add_argument('--max_epochs', type=int, default=1)
+    parser.add_argument('--batch_size', type=int, default=1)
+    parser.add_argument('--max_len', type=int, default=512)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument('--loss_fn', type=str, default='log_sig', choices=['log_sig', 'log_exp'])
+    parser.add_argument('--test', type=bool, default=False)
+    args = parser.parse_args()
+    train(args)
--- a/applications/Chat/examples/train_rm.sh
+++ b/applications/Chat/examples/train_rm.sh
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
+
+torchrun --standalone --nproc_per_node=2 train_reward_model.py \
+   --pretrain  <your pretrain path> \
+   --model 'bloom' \
+   --strategy colossalai_zero2 \
+   --loss_fn 'log_sig'\
+   --save_path <your model saving path>\
+   --dataset 'Anthropic/hh-rlhf'\
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
+import argparse
+import os
+
+import loralib as lora
+import torch
+import torch.distributed as dist
+from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
+from coati.models import convert_to_lora_module
+from coati.trainer import SFTTrainer
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
+from coati.utils import prepare_llama_tokenizer_and_embedding
+from datasets import load_dataset
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM, BloomTokenizerFast, LlamaConfig, LlamaForCausalLM
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+from transformers.models.opt.configuration_opt import OPTConfig
+from transformers.models.opt.modeling_opt import OPTForCausalLM
+
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.tensor import ColoParameter
+
+
+def train(args):
+    # configure strategy
+    if args.strategy == 'naive':
+        strategy = NaiveStrategy()
+    elif args.strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif args.strategy == 'colossalai_gemini':
+        raise NotImplementedError(
+            'Gemini is not supported .from_pretrained() yet. We will update this after checkpoint io is ready.')
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == 'colossalai_zero2_cpu':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
+    else:
+        raise ValueError(f'Unsupported strategy "{args.strategy}"')
+
+    # configure model
+    with strategy.model_init_context():
+        if args.model == 'bloom':
+            model = convert_to_lora_module(BloomForCausalLM.from_pretrained(args.pretrain),
+                                           args.lora_rank).half().cuda()
+        elif args.model == 'opt':
+            model = convert_to_lora_module(OPTForCausalLM.from_pretrained(args.pretrain), args.lora_rank).half().cuda()
+        elif args.model == 'gpt2':
+            model = convert_to_lora_module(GPT2LMHeadModel.from_pretrained(args.pretrain), args.lora_rank).half().cuda()
+        elif args.model == 'llama':
+            model = convert_to_lora_module(LlamaForCausalLM.from_pretrained(args.pretrain),
+                                           args.lora_rank).half().cuda()
+        else:
+            raise ValueError(f'Unsupported model "{args.model}"')
+    if args.grad_checkpoint:
+        model.gradient_checkpointing_enable()
+
+    # configure tokenizer
+    if args.model == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'bloom':
+        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
+        tokenizer.pad_token = tokenizer.eos_token
+    elif args.model == 'opt':
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+    elif args.model == 'llama':
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrain,
+            padding_side="right",
+            use_fast=False,
+        )
+        tokenizer.eos_token = '<\s>'
+    else:
+        raise ValueError(f'Unsupported model "{args.model}"')
+    tokenizer.pad_token = tokenizer.eos_token
+    max_len = args.max_len
+    if args.model == 'llama':
+        tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, model)
+
+        if args.strategy == 'colossalai_gemini':
+            # this is a hack to deal with the resized embedding
+            # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity
+            for name, param in model.named_parameters():
+                if not isinstance(param, ColoParameter):
+                    sub_module_name = '.'.join(name.split('.')[:-1])
+                    weight_name = name.split('.')[-1]
+                    sub_module = model.get_submodule(sub_module_name)
+                    setattr(sub_module, weight_name, ColoParameter(param))
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # configure optimizer
+    if args.strategy.startswith('colossalai'):
+        optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
+    else:
+        optim = Adam(model.parameters(), lr=args.lr)
+
+    logger = get_dist_logger()
+
+    # configure dataset
+    if args.dataset == 'yizhongw/self_instruct':
+        train_data = load_dataset(args.dataset, 'super_natural_instructions', split='train')
+        eval_data = load_dataset(args.dataset, 'super_natural_instructions', split='test')
+
+        train_dataset = SFTDataset(train_data, tokenizer, max_len)
+        eval_dataset = SFTDataset(eval_data, tokenizer, max_len)
+
+    else:
+        train_dataset = SupervisedDataset(tokenizer=tokenizer,
+                                          data_path=args.dataset,
+                                          max_datasets_size=args.max_datasets_size,
+                                          max_length=max_len)
+        eval_dataset = None
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        train_sampler = DistributedSampler(train_dataset,
+                                           shuffle=True,
+                                           seed=42,
+                                           drop_last=True,
+                                           rank=dist.get_rank(),
+                                           num_replicas=dist.get_world_size())
+        if eval_dataset is not None:
+            eval_sampler = DistributedSampler(eval_dataset,
+                                              shuffle=False,
+                                              seed=42,
+                                              drop_last=False,
+                                              rank=dist.get_rank(),
+                                              num_replicas=dist.get_world_size())
+    else:
+        train_sampler = None
+        eval_sampler = None
+
+    train_dataloader = DataLoader(train_dataset,
+                                  shuffle=(train_sampler is None),
+                                  sampler=train_sampler,
+                                  batch_size=args.batch_size,
+                                  collate_fn=data_collator,
+                                  pin_memory=True)
+    if eval_dataset is not None:
+        eval_dataloader = DataLoader(eval_dataset,
+                                     shuffle=(eval_sampler is None),
+                                     sampler=eval_sampler,
+                                     batch_size=args.batch_size,
+                                     collate_fn=data_collator,
+                                     pin_memory=True)
+    else:
+        eval_dataloader = None
+
+    (model, optim) = strategy.prepare((model, optim))
+    trainer = SFTTrainer(model=model,
+                         strategy=strategy,
+                         optim=optim,
+                         train_dataloader=train_dataloader,
+                         eval_dataloader=eval_dataloader,
+                         max_epochs=args.max_epochs,
+                         accumulation_steps=args.accumulation_steps)
+
+    trainer.fit(logger=logger, use_wandb=args.use_wandb)
+
+    # save model checkpoint after fitting on only rank0
+    strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
+    # save optimizer checkpoint on all ranks
+    if args.need_optim_ckpt:
+        strategy.save_optimizer(trainer.optimizer,
+                                'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()),
+                                only_rank0=False)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--strategy',
+                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'],
+                        default='colossalai_zero2')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
+    parser.add_argument('--pretrain', type=str, default=None)
+    parser.add_argument('--dataset', type=str, default=None)
+    parser.add_argument('--max_datasets_size', type=int, default=None)
+    parser.add_argument('--save_path', type=str, default='output')
+    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
+    parser.add_argument('--max_epochs', type=int, default=3)
+    parser.add_argument('--batch_size', type=int, default=4)
+    parser.add_argument('--max_len', type=int, default=512)
+    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
+    parser.add_argument('--lr', type=float, default=5e-6)
+    parser.add_argument('--accumulation_steps', type=int, default=8)
+    parser.add_argument('--use_wandb', default=False, action='store_true')
+    parser.add_argument('--grad_checkpoint', default=False, action='store_true')
+    args = parser.parse_args()
+    train(args)
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
+torchrun --standalone --nproc_per_node=4 train_sft.py \
+    --pretrain "/path/to/LLaMa-7B/" \
+    --model 'llama' \
+    --strategy colossalai_zero2 \
+    --log_interval 10 \
+    --save_path  /path/to/Coati-7B \
+    --dataset /path/to/data.json \
+    --batch_size 4 \
+    --accumulation_steps 8 \
+    --lr 2e-5 \
+    --max_datasets_size 512 \
+    --max_epochs 1 \
--- a/applications/Chat/inference/README.md
+++ b/applications/Chat/inference/README.md
+# Inference
+
+We provide an online inference server and a benchmark. We aim to run inference on a single GPU, so quantization is essential when using large models.
+
+We support 8-bit quantization (RTN), powered by [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [transformers](https://github.com/huggingface/transformers), and 4-bit quantization (GPTQ), powered by [gptq](https://github.com/IST-DASLab/gptq) and [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). We also support FP16 inference.
+
+We only support LLaMA family models now.
+
+## Choosing precision (quantization)
+
+**FP16**: Fastest, best output quality, highest memory usage
+
+**8-bit**: Slow, easier setup (originally supported by transformers), lower output quality (due to RTN), **recommended for first-timers**
+
+**4-bit**: Faster, lowest memory usage, higher output quality (due to GPTQ), but more difficult setup
+
+## Hardware requirements for LLaMA
+
+The data is from [LLaMA Int8 4bit ChatBot Guide v2](https://rentry.org/llama-tard-v2).
+
+### 8-bit
+
+| Model | Min GPU RAM | Recommended GPU RAM | Min RAM/Swap | Card examples |
+| :---: | :---: | :---: | :---: | :---: |
+| LLaMA-7B | 9.2GB | 10GB | 24GB | 3060 12GB, RTX 3080 10GB, RTX 3090 |
+| LLaMA-13B | 16.3GB | 20GB | 32GB | RTX 3090 Ti, RTX 4090 |
+| LLaMA-30B | 36GB | 40GB | 64GB | A6000 48GB, A100 40GB |
+| LLaMA-65B | 74GB | 80GB | 128GB | A100 80GB |
+
+### 4-bit
+
+| Model | Min GPU RAM | Recommended GPU RAM | Min RAM/Swap | Card examples |
+| :---: | :---: | :---: | :---: | :---: |
+| LLaMA-7B | 3.5GB | 6GB | 16GB | RTX 1660, 2060, AMD 5700xt, RTX 3050, 3060 |
+| LLaMA-13B | 6.5GB | 10GB | 32GB | AMD 6900xt, RTX 2060 12GB, 3060 12GB, 3080, A2000 |
+| LLaMA-30B | 15.8GB | 20GB | 64GB | RTX 3080 20GB, A4500, A5000, 3090, 4090, 6000, Tesla V100 |
+| LLaMA-65B | 31.2GB | 40GB | 128GB | A100 40GB, 2x3090, 2x4090, A40, RTX A6000, 8000, Titan Ada |
+
+## General setup
+
+```shell
+pip install -r requirements.txt
+```
+
+## 8-bit setup
+
+8-bit quantization is natively supported by the latest [transformers](https://github.com/huggingface/transformers). Please install it from source.
+
+Please ensure you have downloaded HF-format model weights of LLaMA models.
+
+Usage:
+
+```python
+import torch
+from transformers import LlamaForCausalLM
+
+USE_8BIT = True # use 8-bit quantization; otherwise, use fp16
+
+model = LlamaForCausalLM.from_pretrained(
+            "pretrained/path",
+            load_in_8bit=USE_8BIT,
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+if not USE_8BIT:
+    model.half()  # use fp16
+model.eval()
+```
+
+**Troubleshooting**: if you get an error indicating that your CUDA-related libraries cannot be found when loading the 8-bit model, check whether your `LD_LIBRARY_PATH` is correct.
+
+E.g. you can set `export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`.
+
+## 4-bit setup
+
+Please ensure you have downloaded HF-format model weights of LLaMA models first.
+
+Then you can follow [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). This library provides efficient CUDA kernels and a weight conversion script.
+
+After installing this lib, we may convert the original HF-format LLaMA model weights to 4-bit version.
+
+```shell
+CUDA_VISIBLE_DEVICES=0 python llama.py /path/to/pretrained/llama-7b c4 --wbits 4 --groupsize 128 --save llama7b-4bit-128g.pt
+```
+
+Run this command in your cloned `GPTQ-for-LLaMa` directory, then you will get a 4-bit weight file `llama7b-4bit-128g.pt`.
+
+**Troubleshooting**: if you get an error about `position_ids`, you can check out commit `50287c3b9ae4a3b66f6b5127c643ec39b769b155` of the `GPTQ-for-LLaMa` repo.
+
+## Online inference server
+
+In this directory:
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+# fp16, will listen on 0.0.0.0:7070 by default
+python server.py /path/to/pretrained
+# 8-bit, will listen on localhost:8080
+python server.py /path/to/pretrained --quant 8bit --http_host localhost --http_port 8080
+# 4-bit
+python server.py /path/to/pretrained --quant 4bit --gptq_checkpoint /path/to/llama7b-4bit-128g.pt --gptq_group_size 128
+```
+
+## Benchmark
+
+In this directory:
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+# fp16
+python benchmark.py /path/to/pretrained
+# 8-bit
+python benchmark.py /path/to/pretrained --quant 8bit
+# 4-bit
+python benchmark.py /path/to/pretrained --quant 4bit --gptq_checkpoint /path/to/llama7b-4bit-128g.pt --gptq_group_size 128
+```
+
+This benchmark will record throughput and peak CUDA memory usage.
--- a/applications/Chat/inference/benchmark.py
+++ b/applications/Chat/inference/benchmark.py
+# Adapted from https://github.com/tloen/alpaca-lora/blob/main/generate.py
+
+import argparse
+from time import time
+
+import torch
+from llama_gptq import load_quant
+from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM
+
+
+def generate_prompt(instruction, input=None):
+    if input:
+        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{instruction}
+
+### Input:
+{input}
+
+### Response:"""
+    else:
+        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{instruction}
+
+### Response:"""
+
+
+@torch.no_grad()
+def evaluate(
+    model,
+    tokenizer,
+    instruction,
+    input=None,
+    temperature=0.1,
+    top_p=0.75,
+    top_k=40,
+    num_beams=4,
+    max_new_tokens=128,
+    **kwargs,
+):
+    prompt = generate_prompt(instruction, input)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    input_ids = inputs["input_ids"].cuda()
+    generation_config = GenerationConfig(
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        num_beams=num_beams,
+        **kwargs,
+    )
+    generation_output = model.generate(
+        input_ids=input_ids,
+        generation_config=generation_config,
+        return_dict_in_generate=True,
+        output_scores=True,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+    )
+    s = generation_output.sequences[0]
+    output = tokenizer.decode(s)
+    n_new_tokens = s.size(0) - input_ids.size(1)
+    return output.split("### Response:")[1].strip(), n_new_tokens
+
+
+# Benchmark prompts; the __main__ block below runs `evaluate` once per entry.
+instructions = [
+    "Tell me about alpacas.",
+    "Tell me about the president of Mexico in 2019.",
+    "Tell me about the king of France in 2019.",
+    "List all Canadian provinces in alphabetical order.",
+    "Write a Python program that prints the first 10 Fibonacci numbers.",
+    "Write a program that prints the numbers from 1 to 100. But for multiples of three print 'Fizz' instead of the number and for the multiples of five print 'Buzz'. For numbers which are multiples of both three and five print 'FizzBuzz'.",
+    "Tell me five words that rhyme with 'shock'.",
+    "Translate the sentence 'I have no mouth but I must scream' into Spanish.",
+    "Count up from 1 to 500.",
+    # ===
+    "How to play support in legends of league",
+    "Write a Python program that calculate Fibonacci numbers.",
+]
+# Four copies of the first prompt.
+# NOTE(review): `inst` is not referenced anywhere in the visible part of this script - confirm whether it is still needed.
+inst = [instructions[0]] * 4
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'pretrained',
+        help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
+    parser.add_argument('--quant',
+                        choices=['8bit', '4bit'],
+                        default=None,
+                        help='Quantization mode. Default: None (no quantization, fp16).')
+    parser.add_argument(
+        '--gptq_checkpoint',
+        default=None,
+        help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
+    parser.add_argument('--gptq_group_size',
+                        type=int,
+                        default=128,
+                        help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
+    args = parser.parse_args()
+
+    if args.quant == '4bit':
+        assert args.gptq_checkpoint is not None, 'Please specify a GPTQ checkpoint.'
+
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
+
+    if args.quant == '4bit':
+        model = load_quant(args.pretrained, args.gptq_checkpoint, 4, args.gptq_group_size)
+        model.cuda()
+    else:
+        model = LlamaForCausalLM.from_pretrained(
+            args.pretrained,
+            load_in_8bit=(args.quant == '8bit'),
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+        if args.quant != '8bit':
+            model.half()    # seems to fix bugs for some users.
+        model.eval()
+
+    total_tokens = 0
+    start = time()
+    for instruction in instructions:
+        print(f"Instruction: {instruction}")
+        resp, tokens = evaluate(model, tokenizer, instruction, temparature=0.2, num_beams=1)
+        total_tokens += tokens
+        print(f"Response: {resp}")
+        print('\n----------------------------\n')
+    duration = time() - start
+    print(f'Total time: {duration:.3f} s, {total_tokens/duration:.3f} tokens/s')
+    print(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB')
--- a/applications/Chat/inference/llama_gptq/__init__.py
+++ b/applications/Chat/inference/llama_gptq/__init__.py
+# Package entry point: re-export the GPTQ checkpoint loader.
+from .loader import load_quant
+
+# Public API of the package.
+__all__ = [
+    'load_quant',
+]
--- a/applications/Chat/inference/llama_gptq/loader.py
+++ b/applications/Chat/inference/llama_gptq/loader.py
+import torch
+import torch.nn as nn
+import transformers
+from transformers import LlamaConfig, LlamaForCausalLM
+
+from .model_utils import find_layers
+from .quant import make_quant
+
+
+def load_quant(pretrained: str, checkpoint: str, wbits: int, groupsize: int):
+    config = LlamaConfig.from_pretrained(pretrained)
+
+    def noop(*args, **kwargs):
+        pass
+
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = LlamaForCausalLM(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize)
+
+    print(f'Loading model with {wbits} bits...')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
--- a/applications/Chat/inference/llama_gptq/model_utils.py
+++ b/applications/Chat/inference/llama_gptq/model_utils.py
+# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/modelutils.py
+
+import torch
+import torch.nn as nn
+
+
+def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+    if type(module) in layers:
+        return {name: module}
+    res = {}
+    for name1, child in module.named_children():
+        res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
+    return res
--- a/applications/Chat/inference/llama_gptq/quant.py
+++ b/applications/Chat/inference/llama_gptq/quant.py
+# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/quant.py
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def quantize(x, scale, zero, maxq):
+    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
+    return scale * (q - zero)
+
+
+class Quantizer(nn.Module):
+    """Holds quantization parameters (``maxq``, ``scale``, ``zero``) and fits them to a tensor.
+
+    ``configure`` selects the bit width and fitting options, ``find_params``
+    derives ``scale``/``zero`` from a tensor's value range, and ``quantize``
+    applies them through the module-level ``quantize`` function.
+    """
+
+    def __init__(self, shape=1):
+        super(Quantizer, self).__init__()
+        # All buffers start at zero, so enabled() and ready() are false until
+        # configure() and find_params() have run.
+        self.register_buffer('maxq', torch.tensor(0))
+        self.register_buffer('scale', torch.zeros(shape))
+        self.register_buffer('zero', torch.zeros(shape))
+
+    def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8):
+        """Set the largest quantized level (``2**bits - 1``) and store the fitting options."""
+        self.maxq = torch.tensor(2**bits - 1)
+        self.perchannel = perchannel
+        self.sym = sym
+        self.mse = mse
+        self.norm = norm
+        self.grid = grid
+        self.maxshrink = maxshrink
+
+    def find_params(self, x, weight=False):
+        """Compute ``self.scale`` and ``self.zero`` from the value range of ``x``.
+
+        With ``perchannel`` set, one (scale, zero) pair is computed per row of a
+        2-D rearrangement of ``x``; otherwise a single pair is computed over all
+        of ``x`` and repeated. The results are reshaped at the end so they
+        broadcast against a tensor shaped like ``x``.
+        """
+        dev = x.device
+        self.maxq = self.maxq.to(dev)
+
+        shape = x.shape
+        if self.perchannel:
+            if weight:
+                # One row per index of the first dimension.
+                x = x.flatten(1)
+            else:
+                # The rank checks use the original `shape`, so exactly one applies.
+                if len(shape) == 4:
+                    # Rows are indexed by the original dimension 1.
+                    x = x.permute([1, 0, 2, 3])
+                    x = x.flatten(1)
+                if len(shape) == 3:
+                    # Rows are indexed by the original last dimension.
+                    x = x.reshape((-1, shape[-1])).t()
+                if len(shape) == 2:
+                    x = x.t()
+        else:
+            # Single row covering every element.
+            x = x.flatten().unsqueeze(0)
+
+        # Per-row range, widened so that it always contains zero.
+        tmp = torch.zeros(x.shape[0], device=dev)
+        xmin = torch.minimum(x.min(1)[0], tmp)
+        xmax = torch.maximum(x.max(1)[0], tmp)
+
+        if self.sym:
+            # Symmetric range: use the larger magnitude on both sides.
+            xmax = torch.maximum(torch.abs(xmin), xmax)
+            tmp = xmin < 0
+            if torch.any(tmp):
+                xmin[tmp] = -xmax[tmp]
+        # Rows that are entirely zero get the range [-1, 1] so the scale is not zero.
+        tmp = (xmin == 0) & (xmax == 0)
+        xmin[tmp] = -1
+        xmax[tmp] = +1
+
+        self.scale = (xmax - xmin) / self.maxq
+        if self.sym:
+            self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
+        else:
+            self.zero = torch.round(-xmin / self.scale)
+
+        if self.mse:
+            # Grid search over shrunken ranges (factor p); per row, keep the
+            # scale/zero with the smallest sum of |quantized - x| ** norm.
+            best = torch.full([x.shape[0]], float('inf'), device=dev)
+            for i in range(int(self.maxshrink * self.grid)):
+                p = 1 - i / self.grid
+                xmin1 = p * xmin
+                xmax1 = p * xmax
+                scale1 = (xmax1 - xmin1) / self.maxq
+                zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
+                q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
+                q -= x
+                q.abs_()
+                q.pow_(self.norm)
+                err = torch.sum(q, 1)
+                tmp = err < best
+                if torch.any(tmp):
+                    best[tmp] = err[tmp]
+                    self.scale[tmp] = scale1[tmp]
+                    self.zero[tmp] = zero1[tmp]
+        if not self.perchannel:
+            # Repeat the single pair once per entry of the dimension chosen below.
+            if weight:
+                tmp = shape[0]
+            else:
+                tmp = shape[1] if len(shape) != 3 else shape[2]
+            self.scale = self.scale.repeat(tmp)
+            self.zero = self.zero.repeat(tmp)
+
+        # Reshape so scale/zero broadcast against a tensor of the original shape.
+        if weight:
+            shape = [-1] + [1] * (len(shape) - 1)
+            self.scale = self.scale.reshape(shape)
+            self.zero = self.zero.reshape(shape)
+            return
+        if len(shape) == 4:
+            self.scale = self.scale.reshape((1, -1, 1, 1))
+            self.zero = self.zero.reshape((1, -1, 1, 1))
+        if len(shape) == 3:
+            self.scale = self.scale.reshape((1, 1, -1))
+            self.zero = self.zero.reshape((1, 1, -1))
+        if len(shape) == 2:
+            self.scale = self.scale.unsqueeze(0)
+            self.zero = self.zero.unsqueeze(0)
+
+    def quantize(self, x):
+        """Quantize ``x`` with the stored parameters, or return it unchanged if not ready."""
+        if self.ready():
+            # Calls the module-level quantize() function, not this method.
+            return quantize(x, self.scale, self.zero, self.maxq)
+        return x
+
+    def enabled(self):
+        """Return whether ``maxq`` is positive (a tensor-valued comparison)."""
+        return self.maxq > 0
+
+    def ready(self):
+        """Return whether every entry of ``scale`` is non-zero."""
+        return torch.all(self.scale != 0)
+
+
+try:
+    import quant_cuda
+except:
+    print('CUDA extension not installed.')
+
+# Assumes layer is perfectly divisible into 256 * 256 blocks
+
+
+class QuantLinear(nn.Module):
+
+    def __init__(self, bits, groupsize, infeatures, outfeatures):
+        super().__init__()
+        if bits not in [2, 3, 4, 8]:
+            raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        self.infeatures = infeatures
+        self.outfeatures = outfeatures
+        self.bits = bits
+        if groupsize != -1 and groupsize < 32 and groupsize != int(math.pow(2, int(math.log2(groupsize)))):
+            raise NotImplementedError("groupsize supports powers of 2 greater than 32. (e.g. : 32,64,128,etc)")
+        groupsize = groupsize if groupsize != -1 else infeatures
+        self.groupsize = groupsize
+        self.register_buffer(
+            'qzeros', torch.zeros((math.ceil(infeatures / groupsize), outfeatures // 256 * (bits * 8)),
+                                  dtype=torch.int))
+        self.register_buffer('scales', torch.zeros((math.ceil(infeatures / groupsize), outfeatures)))
+        self.register_buffer('bias', torch.zeros(outfeatures))
+        self.register_buffer('qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int))
+        self._initialized_quant_state = False
+
+    def pack(self, linear, scales, zeros):
+        scales = scales.t().contiguous()
+        zeros = zeros.t().contiguous()
+        scale_zeros = zeros * scales
+        self.scales = scales.clone()
+        if linear.bias is not None:
+            self.bias = linear.bias.clone()
+
+        intweight = []
+        for idx in range(self.infeatures):
+            g_idx = idx // self.groupsize
+            intweight.append(
+                torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:,
+                                                                                                                  None])
+        intweight = torch.cat(intweight, dim=1)
+        intweight = intweight.t().contiguous()
+        intweight = intweight.numpy().astype(np.uint32)
+        qweight = np.zeros((intweight.shape[0] // 256 * (self.bits * 8), intweight.shape[1]), dtype=np.uint32)
+        i = 0
+        row = 0
+        while row < qweight.shape[0]:
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qweight[row] |= intweight[j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                row += 1
+            elif self.bits == 3:
+                for j in range(i, i + 10):
+                    qweight[row] |= intweight[j] << (3 * (j - i))
+                i += 10
+                qweight[row] |= intweight[i] << 30
+                row += 1
+                qweight[row] |= (intweight[i] >> 2) & 1
+                i += 1
+                for j in range(i, i + 10):
+                    qweight[row] |= intweight[j] << (3 * (j - i) + 1)
+                i += 10
+                qweight[row] |= intweight[i] << 31
+                row += 1
+                qweight[row] |= (intweight[i] >> 1) & 0x3
+                i += 1
+                for j in range(i, i + 10):
+                    qweight[row] |= intweight[j] << (3 * (j - i) + 2)
+                i += 10
+                row += 1
+            else:
+                raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+
+        qweight = qweight.astype(np.int32)
+        self.qweight = torch.from_numpy(qweight)
+
+        zeros -= 1
+        zeros = zeros.numpy().astype(np.uint32)
+        qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (self.bits * 8)), dtype=np.uint32)
+        i = 0
+        col = 0
+        while col < qzeros.shape[1]:
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                col += 1
+            elif self.bits == 3:
+                for j in range(i, i + 10):
+                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
+                i += 10
+                qzeros[:, col] |= zeros[:, i] << 30
+                col += 1
+                qzeros[:, col] |= (zeros[:, i] >> 2) & 1
+                i += 1
+                for j in range(i, i + 10):
+                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
+                i += 10
+                qzeros[:, col] |= zeros[:, i] << 31
+                col += 1
+                qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
+                i += 1
+                for j in range(i, i + 10):
+                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
+                i += 10
+                col += 1
+            else:
+                raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+
+        qzeros = qzeros.astype(np.int32)
+        self.qzeros = torch.from_numpy(qzeros)
+
+    def forward(self, x):
+        intermediate_dtype = torch.float32
+
+        if not self._initialized_quant_state:
+            # Do we even have a bias? Check for at least one non-zero element.
+            if self.bias is not None and bool(torch.any(self.bias != 0)):
+                # Then make sure it's the right type.
+                self.bias.data = self.bias.data.to(intermediate_dtype)
+            else:
+                self.bias = None
+
+        outshape = list(x.shape)
+        outshape[-1] = self.outfeatures
+        x = x.reshape(-1, x.shape[-1])
+        if self.bias is None:
+            y = torch.zeros(x.shape[0], outshape[-1], dtype=intermediate_dtype, device=x.device)
+        else:
+            y = self.bias.clone().repeat(x.shape[0], 1)
+
+        output_dtype = x.dtype
+        x = x.to(intermediate_dtype)
+        if self.bits == 2:
+            quant_cuda.vecquant2matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        elif self.bits == 3:
+            quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        elif self.bits == 4:
+            quant_cuda.vecquant4matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        elif self.bits == 8:
+            quant_cuda.vecquant8matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize)
+        else:
+            raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        y = y.to(output_dtype)
+        return y.reshape(outshape)
+
+
+def make_quant(module, names, bits, groupsize, name=''):
+    if isinstance(module, QuantLinear):
+        return
+    for attr in dir(module):
+        tmp = getattr(module, attr)
+        name1 = name + '.' + attr if name != '' else attr
+        if name1 in names:
+            setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features))
+    for name1, child in module.named_children():
+        make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
--- a/applications/Chat/inference/locustfile.py
+++ b/applications/Chat/inference/locustfile.py
+from json import JSONDecodeError
+
+from locust import HttpUser, task
+
+# Request histories used by GenerationUser: each entry is a list of
+# instruction/response dicts that is sent as the `history` field of a request.
+samples = [[
+    dict(
+        instruction='Who is the best player in the history of NBA?',
+        response=
+        'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
+    ),
+    dict(instruction='continue this talk', response=''),
+], [
+    dict(instruction='Who is the best player in the history of NBA?', response=''),
+]]
+
+
+class GenerationUser(HttpUser):
+
+    @task
+    def generate(self):
+        for sample in samples:
+            data = {'max_new_tokens': 64, 'history': sample}
+            with self.client.post('/generate', json=data, catch_response=True) as response:
+                if response.status_code in (200, 406):
+                    response.success()
+                else:
+                    response.failure('Response wrong')
--- a/applications/Chat/inference/requirements.txt
+++ b/applications/Chat/inference/requirements.txt
+fastapi
+locust
+numpy
+pydantic
+safetensors
+slowapi
+sse_starlette
+torch
+uvicorn
+git+https://github.com/huggingface/transformers
+accelerate
+bitsandbytes
+jieba
\ No newline at end of file
--- a/applications/Chat/inference/server.py
+++ b/applications/Chat/inference/server.py
+import argparse
+import os
+from threading import Lock
+from typing import Dict, Generator, List, Optional
+
+import torch
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from llama_gptq import load_quant
+from pydantic import BaseModel, Field
+from slowapi import Limiter, _rate_limit_exceeded_handler
+from slowapi.errors import RateLimitExceeded
+from slowapi.util import get_remote_address
+from sse_starlette.sse import EventSourceResponse
+from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM
+from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn, load_json
+
+# Instruction preamble text.
+# NOTE(review): presumably prepended to prompts by the prompt processor - confirm against its use.
+CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
+# NOTE(review): presumably a maximum sequence length in tokens - confirm against its use.
+MAX_LEN = 512
+# Lock handed to LockedIterator in generate_streamingly.
+running_lock = Lock()
+
+
+class GenerationTaskReq(BaseModel):
+    """Request body for generation endpoints, validated by pydantic."""
+    # Number of tokens to generate: 0 < value <= 512.
+    max_new_tokens: int = Field(gt=0, le=512, example=64)
+    # Dialogue history; must contain at least one item.
+    history: List[Dialogue] = Field(min_items=1)
+    # Optional sampling parameters; None when omitted.
+    top_k: Optional[int] = Field(default=None, gt=0, example=50)
+    top_p: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.5)
+    temperature: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.7)
+    # Must be strictly greater than 1.0 when provided.
+    repetition_penalty: Optional[float] = Field(default=None, gt=1.0, example=1.2)
+
+
+limiter = Limiter(key_func=get_remote_address)
+app = FastAPI()
+app.state.limiter = limiter
+app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
+
+# set CORS
+origin_spec_from_env = os.environ.get('CORS_ORIGIN', None)
+
+if origin_spec_from_env is not None:
+    # allow CORS from the specified origins
+    origins = os.environ['CORS_ORIGIN'].split(',')
+else:
+    # allow CORS from all origins
+    origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
+    """Yield the model's reply to ``prompt`` as a stream of decoded text pieces.
+
+    The prompt is tokenized with the module-level ``tokenizer``, moved to CUDA and
+    fed to ``sample_streamingly`` together with the module-level ``model``. Each
+    output of that sampler is converted back to tokens, special tokens are
+    dropped, and the remainder is decoded with ``tokenizer.sp_model``.
+
+    Args:
+        prompt: Fully pre-processed prompt text.
+        max_new_tokens: Forwarded to the sampler as ``max_generate_tokens``.
+        top_k: Forwarded to the sampler unchanged (may be None).
+        top_p: Forwarded to the sampler unchanged (may be None).
+        temperature: Forwarded to the sampler unchanged (may be None).
+
+    Yields:
+        str: The decoded text for one sampler output. The first piece has its
+        leading whitespace stripped; a later piece whose first sub-token starts
+        with ``'▁'`` is prefixed with a single space.
+    """
+    inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
+    #TODO(ver217): streaming generation does not support repetition_penalty now
+    model_kwargs = {
+        'max_generate_tokens': max_new_tokens,
+        'early_stopping': True,
+        'top_k': top_k,
+        'top_p': top_p,
+        'temperature': temperature,
+        'prepare_inputs_fn': model.prepare_inputs_for_generation,
+        'update_model_kwargs_fn': update_model_kwargs_fn,
+    }
+    # `all_special_tokens` is a property; read it once instead of once per token,
+    # and keep it as a set for constant-time membership tests.
+    special_tokens = set(tokenizer.all_special_tokens)
+    is_first_word = True
+    # Sampler steps go through LockedIterator with the shared lock
+    # (presumably so that only one request advances the model at a time; see utils).
+    generator = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)
+    for output in generator:
+        output = output.cpu()
+        tokens = tokenizer.convert_ids_to_tokens(output, skip_special_tokens=True)
+        current_sub_tokens = [token for token in tokens if token not in special_tokens]
+        if not current_sub_tokens:
+            # nothing printable in this step (only special tokens)
+            continue
+        out_string = tokenizer.sp_model.decode(current_sub_tokens)
+        if is_first_word:
+            out_string = out_string.lstrip()
+            is_first_word = False
+        elif current_sub_tokens[0].startswith('▁'):
+            # whitespace will be ignored by the frontend
+            out_string = ' ' + out_string
+        yield out_string
+
+
+async def event_generator(request: Request, generator: Generator):
+    # Adapts a plain (synchronous) generator of text pieces into server-sent events.
+    # Emits one 'generate' event per item, then a final 'end' event once the
+    # generator is exhausted. Stops silently as soon as the client has disconnected.
+    # NOTE(review): `next(generator)` is a synchronous call inside a coroutine.
+    while not await request.is_disconnected():
+        try:
+            piece = next(generator)
+        except StopIteration:
+            yield {'event': 'end', 'data': ''}
+            return
+        yield {'event': 'generate', 'data': piece}
+
+
+@app.post('/generate/stream')
+@limiter.limit('1/second')
+def generate(data: GenerationTaskReq, request: Request):
+    # Streaming endpoint: builds the prompt from the chat history and returns the
+    # reply as a server-sent event stream.
+    # `request` is required in the signature by the rate limiter decorator and is
+    # also used to detect client disconnects while streaming.
+    # NOTE(review): unlike `/generate`, this path does not call
+    # `has_censored_words`, and `repetition_penalty` is not forwarded.
+    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
+    text_stream = generate_streamingly(
+        prompt,
+        data.max_new_tokens,
+        data.top_k,
+        data.top_p,
+        data.temperature,
+    )
+    return EventSourceResponse(event_generator(request, text_stream))
+
+
+@app.post('/generate')
+@limiter.limit('1/second')
+def generate_no_stream(data: GenerationTaskReq, request: Request):
+    # Blocking endpoint: returns the whole reply as a single string.
+    # Both the prompt and the generated text are screened with
+    # `has_censored_words`; a hit on either returns `SAFE_RESPONSE` instead.
+    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
+    if prompt_processor.has_censored_words(prompt):
+        return prompt_processor.SAFE_RESPONSE
+
+    encoded = tokenizer(prompt, return_tensors="pt")
+    inputs = {name: tensor.cuda() for name, tensor in encoded.items()}
+    # every request field except the chat history is forwarded to `model.generate`
+    generation_kwargs = data.dict(exclude={'history'})
+    with running_lock:
+        output = model.generate(**inputs, **generation_kwargs)
+
+    # keep only the tokens that follow the prompt in the first returned sequence
+    prompt_len = inputs['input_ids'].size(1)
+    response = output.cpu()[0, prompt_len:]
+    decoded = tokenizer.decode(response, skip_special_tokens=True)
+    out_string = prompt_processor.postprocess_output(decoded)
+    return prompt_processor.SAFE_RESPONSE if prompt_processor.has_censored_words(out_string) else out_string
+
+
+if __name__ == '__main__':
+    # Script entry point: parse CLI options, load the tokenizer / prompt processor /
+    # model into module-level names used by the endpoints, then serve with uvicorn.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'pretrained',
+        help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
+    parser.add_argument('--quant',
+                        choices=['8bit', '4bit'],
+                        default=None,
+                        help='Quantization mode. Default: None (no quantization, fp16).')
+    parser.add_argument(
+        '--gptq_checkpoint',
+        default=None,
+        help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
+    parser.add_argument('--gptq_group_size',
+                        type=int,
+                        default=128,
+                        help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
+    parser.add_argument('--http_host', default='0.0.0.0')
+    parser.add_argument('--http_port', type=int, default=7070)
+    parser.add_argument(
+        '--profanity_file',
+        default=None,
+        help='Path to profanity words list. It should be a JSON file containing a list of words.')
+    args = parser.parse_args()
+
+    # Validate with parser.error rather than assert: asserts are removed under
+    # `python -O`, and parser.error prints the usage text before exiting.
+    if args.quant == '4bit' and args.gptq_checkpoint is None:
+        parser.error('Please specify a GPTQ checkpoint.')
+
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
+
+    # An absent profanity file means no words are censored.
+    if args.profanity_file is not None:
+        censored_words = load_json(args.profanity_file)
+    else:
+        censored_words = []
+    prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN, censored_words=censored_words)
+
+    if args.quant == '4bit':
+        # 4-bit weights come from the GPTQ checkpoint and are moved to the GPU explicitly.
+        model = load_quant(args.pretrained, args.gptq_checkpoint, 4, args.gptq_group_size)
+        model.cuda()
+    else:
+        # fp16 or 8-bit weights, with device placement delegated to `device_map="auto"`.
+        model = LlamaForCausalLM.from_pretrained(
+            args.pretrained,
+            load_in_8bit=(args.quant == '8bit'),
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+        if args.quant != '8bit':
+            model.half()    # seems to fix bugs for some users.
+        model.eval()
+
+    config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
+    server = uvicorn.Server(config=config)
+    server.run()