[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format

[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
079bf3cb · Hongxin Liu · GitHub · 3c6b831c · 079bf3cb · 079bf3cb
Unverified commit 079bf3cb, authored Sep 19, 2023 by Hongxin Liu; committed by GitHub on Sep 19, 2023.
20 changed files
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -39,8 +39,9 @@ class SFTTrainer(SLTrainer):
        accumulation_steps: int = 8,
    ) -> None:
        if accumulation_steps > 1:
-            assert not isinstance(strategy, GeminiStrategy), \
-                "Accumulation steps are not supported in stage 3 of ColossalAI"
+            assert not isinstance(
+                strategy, GeminiStrategy
+            ), "Accumulation steps are not supported in stage 3 of ColossalAI"

        super().__init__(strategy, max_epochs, model, optim)

@@ -50,15 +51,11 @@ class SFTTrainer(SLTrainer):
    def _train(self, epoch: int):
        self.model.train()
        for batch_id, batch in enumerate(self.train_dataloader):
-
            batch = to_device(batch, torch.cuda.current_device())
            if "attention_mask" in batch:
-                outputs = self.model(batch["input_ids"],
-                                    attention_mask=batch["attention_mask"],
-                                    labels=batch["labels"])
+                outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
            else:
-                outputs = self.model(batch["input_ids"],
-                                    labels=batch["labels"])
+                outputs = self.model(batch["input_ids"], labels=batch["labels"])

            loss = outputs.loss
            loss = loss / self.accumulation_steps
@@ -73,12 +70,14 @@ class SFTTrainer(SLTrainer):
                self.optimizer.zero_grad()
                self.scheduler.step()
                if is_rank_0() and self.use_wandb:
-                    wandb.log({
-                        "loss": self.total_loss / self.accumulation_steps,
-                        "lr": self.scheduler.get_last_lr()[0],
-                        "epoch": epoch,
-                        "batch_id": batch_id
-                    })
+                    wandb.log(
+                        {
+                            "loss": self.total_loss / self.accumulation_steps,
+                            "lr": self.scheduler.get_last_lr()[0],
+                            "epoch": epoch,
+                            "batch_id": batch_id,
+                        }
+                    )
                self.total_loss = 0
                self.step_bar.update()

@@ -89,9 +88,9 @@ class SFTTrainer(SLTrainer):
                loss_sum, num_seen = 0, 0
                for batch in self.eval_dataloader:
                    batch = to_device(batch, torch.cuda.current_device())
-                    outputs = self.model(batch["input_ids"],
-                                         attention_mask=batch["attention_mask"],
-                                         labels=batch["labels"])
+                    outputs = self.model(
+                        batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]
+                    )
                    loss = outputs.loss

                    loss_sum += loss.item()
@@ -99,13 +98,15 @@ class SFTTrainer(SLTrainer):

                loss_mean = loss_sum / num_seen
                if dist.get_rank() == 0:
-                    self.logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}')
+                    self.logger.info(f"Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}")

-    def _before_fit(self,
-                    train_dataloader: DataLoader,
-                    eval_dataloader: Optional[DataLoader] = None,
-                    logger: Optional[DistributedLogger] = None,
-                    use_wandb: bool = False):
+    def _before_fit(
+        self,
+        train_dataloader: DataLoader,
+        eval_dataloader: Optional[DataLoader] = None,
+        logger: Optional[DistributedLogger] = None,
+        use_wandb: bool = False,
+    ):
        """
        Args:
            train_dataloader: the dataloader to use for training
@@ -124,6 +125,6 @@ class SFTTrainer(SLTrainer):
        self.no_epoch_bar = True
        self.step_bar = tqdm.trange(
            len(self.train_dataloader) // self.accumulation_steps * self.max_epochs,
-            desc=f'steps',
-            disable=not is_rank_0()
+            desc=f"steps",
+            disable=not is_rank_0(),
        )
--- a/applications/Chat/coati/trainer/strategies/__init__.py
+++ b/applications/Chat/coati/trainer/strategies/__init__.py
@@ -2,7 +2,4 @@ from .base import Strategy
 from .colossalai import GeminiStrategy, LowLevelZeroStrategy
 from .ddp import DDPStrategy

-__all__ = [
-    'Strategy', 'DDPStrategy',
-    'LowLevelZeroStrategy', 'GeminiStrategy'
-]
+__all__ = ["Strategy", "DDPStrategy", "LowLevelZeroStrategy", "GeminiStrategy"]
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -19,7 +19,7 @@ _BoostArgSpec = Union[nn.Module, Tuple[nn.Module, Optimizer], Dict]

 class Strategy(ABC):
    """
-        Base class for training strategies.
+    Base class for training strategies.
    """

    def __init__(self, plugin_initializer: Callable[..., Optional[Plugin]] = lambda: None) -> None:
@@ -83,16 +83,18 @@ class Strategy(ABC):
                rets.append((model, optimizer))
            elif isinstance(arg, Dict):
                model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
-                boost_result = dict(model=model,
-                                    optimizer=optimizer,
-                                    criterion=criterion,
-                                    dataloader=dataloader,
-                                    lr_scheduler=lr_scheduler)
+                boost_result = dict(
+                    model=model,
+                    optimizer=optimizer,
+                    criterion=criterion,
+                    dataloader=dataloader,
+                    lr_scheduler=lr_scheduler,
+                )
                # remove None values
                boost_result = {key: value for key, value in boost_result.items() if value is not None}
                rets.append(boost_result)
            else:
-                raise RuntimeError(f'Type {type(arg)} is not supported')
+                raise RuntimeError(f"Type {type(arg)} is not supported")

        return rets[0] if len(rets) == 1 else rets

@@ -125,11 +127,9 @@ class Strategy(ABC):
        return DistributedSampler(dataset, 1, 0)

    @abstractmethod
-    def save_pretrained(self,
-                        model: nn.Module,
-                        path: str,
-                        only_rank0: bool = True,
-                        tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
+    def save_pretrained(
+        self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+    ) -> None:
        pass

    @abstractmethod

--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -42,27 +42,27 @@ class LowLevelZeroStrategy(DDPStrategy):

    """

-    def __init__(self,
-                 stage: int = 2,
-                 precision: str = 'fp16',
-                 seed: int = 42,
-                 placement_policy: str = 'cuda',
-                 reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
-                 overlap_communication: bool = True,    # only for stage 1&2
-                 initial_scale: float = 2**16,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 min_scale: float = 1,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0.0,
-                 norm_type: float = 2.0
-                 ) -> None:
-
+    def __init__(
+        self,
+        stage: int = 2,
+        precision: str = "fp16",
+        seed: int = 42,
+        placement_policy: str = "cuda",
+        reduce_bucket_size: int = 12 * 1024**2,  # only for stage 1&2
+        overlap_communication: bool = True,  # only for stage 1&2
+        initial_scale: float = 2**16,
+        growth_factor: float = 2,
+        backoff_factor: float = 0.5,
+        growth_interval: int = 1000,
+        hysteresis: int = 2,
+        min_scale: float = 1,
+        max_scale: float = 2**32,
+        max_norm: float = 0.0,
+        norm_type: float = 2.0,
+    ) -> None:
        assert stage in (1, 2), f'Unsupported stage "{stage}"'
-        assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
-        assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
+        assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
+        assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'

        plugin_initializer = lambda: LowLevelZeroPlugin(
            # zero_config
@@ -71,7 +71,7 @@ class LowLevelZeroStrategy(DDPStrategy):
            # zero_optim_config
            reduce_bucket_size_in_m=reduce_bucket_size,
            overlap_communication=overlap_communication,
-            cpu_offload=(placement_policy == 'cpu'),
+            cpu_offload=(placement_policy == "cpu"),
            # optim_config
            initial_scale=initial_scale,
            growth_factor=growth_factor,
@@ -81,14 +81,15 @@ class LowLevelZeroStrategy(DDPStrategy):
            min_scale=min_scale,
            max_scale=max_scale,
            max_norm=max_norm,
-            norm_type=norm_type
+            norm_type=norm_type,
        )

        super().__init__(seed, plugin_initializer)

    def _post_init(self) -> None:
-        assert isinstance(self.plugin, LowLevelZeroPlugin), \
-            f'{type(self).__name__}\'s plugin is not initialized properly.'
+        assert isinstance(
+            self.plugin, LowLevelZeroPlugin
+        ), f"{type(self).__name__}'s plugin is not initialized properly."

    def setup_distributed(self) -> None:
        colossalai.launch_from_torch({}, seed=self.seed)
@@ -131,45 +132,45 @@ class GeminiStrategy(DDPStrategy):

    """

-    def __init__(self,
-                 seed: int = 42,
-                 shard_init: bool = False,    # only for stage 3
-                 placement_policy: str = 'cuda',
-                 pin_memory: bool = True,    # only for stage 3
-                 force_outputs_fp32: bool = False,    # only for stage 3
-                 search_range_m: int = 32,    # only for stage 3
-                 hidden_dim: Optional[int] = None,    # only for stage 3
-                 min_chunk_size_m: float = 32,    # only for stage 3
-                 gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
-                 initial_scale: float = 2**16,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 min_scale: float = 1,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0.0,
-                 norm_type: float = 2.0
-                 ) -> None:
-
-        assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
+    def __init__(
+        self,
+        seed: int = 42,
+        shard_init: bool = False,  # only for stage 3
+        placement_policy: str = "cuda",
+        pin_memory: bool = True,  # only for stage 3
+        force_outputs_fp32: bool = False,  # only for stage 3
+        search_range_m: int = 32,  # only for stage 3
+        hidden_dim: Optional[int] = None,  # only for stage 3
+        min_chunk_size_m: float = 32,  # only for stage 3
+        gpu_margin_mem_ratio: float = 0.0,  # only for stage 3
+        initial_scale: float = 2**16,
+        growth_factor: float = 2,
+        backoff_factor: float = 0.5,
+        growth_interval: int = 1000,
+        hysteresis: int = 2,
+        min_scale: float = 1,
+        max_scale: float = 2**32,
+        max_norm: float = 0.0,
+        norm_type: float = 2.0,
+    ) -> None:
+        assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'

        # TODO(ver217): support shard_init when using from_pretrained()
        if shard_init:
            warnings.warn(
-                f'Shard init is not supported model.from_pretrained() yet. '
-                'Please load weights after strategy.prepare()'
+                f"Shard init is not supported model.from_pretrained() yet. "
+                "Please load weights after strategy.prepare()"
            )
        self.shard_init = shard_init

-        warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
+        warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")

        # NOTE: dist should be initialized before calling get_current_device()
        plugin_initializer = lambda: GeminiPlugin(
            # gemini_config
            device=get_current_device(),
            placement_policy=placement_policy,
-            precision='fp16',
+            precision="fp16",
            pin_memory=pin_memory,
            force_outputs_fp32=force_outputs_fp32,
            strict_ddp_mode=shard_init,
@@ -187,14 +188,13 @@ class GeminiStrategy(DDPStrategy):
            min_scale=min_scale,
            max_scale=max_scale,
            max_norm=max_norm,
-            norm_type=norm_type
+            norm_type=norm_type,
        )

        super().__init__(seed, plugin_initializer)

    def _post_init(self) -> None:
-        assert isinstance(self.plugin, GeminiPlugin), \
-            f'{type(self).__name__}\'s plugin is not initialized properly.'
+        assert isinstance(self.plugin, GeminiPlugin), f"{type(self).__name__}'s plugin is not initialized properly."

    def setup_distributed(self) -> None:
        colossalai.launch_from_torch({}, seed=self.seed)
@@ -203,10 +203,9 @@ class GeminiStrategy(DDPStrategy):
        world_size = dist.get_world_size()
        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(device=get_current_device(),
-                               dtype=torch.half,
-                               default_pg=shard_pg,
-                               default_dist_spec=default_dist_spec)
+        return ColoInitContext(
+            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
+        )

    def unwrap_model(self, model: nn.Module) -> nn.Module:
        assert isinstance(model, GeminiModel)

--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -31,24 +31,21 @@ def get_grad_required_state_dict(model: nn.Module):

 class DDPStrategy(Strategy):
    """
-        Strategy for distributed training using torch.distributed.
+    Strategy for distributed training using torch.distributed.
    """

-    def __init__(self,
-                 seed: int = 42,
-                 plugin_initializer: Callable = TorchDDPPlugin
-                 ) -> None:
+    def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
        self.seed = seed
        super().__init__(plugin_initializer)

    def _try_init_dist(self, force: bool = False) -> None:
        try:
-            rank = int(os.environ['RANK'])
-            local_rank = int(os.environ['LOCAL_RANK'])
-            world_size = int(os.environ['WORLD_SIZE'])
-            host = os.environ['MASTER_ADDR']
-            port = int(os.environ['MASTER_PORT'])
-            dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+            rank = int(os.environ["RANK"])
+            local_rank = int(os.environ["LOCAL_RANK"])
+            world_size = int(os.environ["WORLD_SIZE"])
+            host = os.environ["MASTER_ADDR"]
+            port = int(os.environ["MASTER_PORT"])
+            dist.init_process_group("nccl", init_method=f"tcp://[{host}]:{port}", world_size=world_size, rank=rank)
            torch.cuda.set_device(local_rank)
        except KeyError as e:
            if force:
@@ -60,8 +57,7 @@ class DDPStrategy(Strategy):
                raise e

    def _post_init(self) -> None:
-        assert isinstance(self.plugin, TorchDDPPlugin), \
-            f'{type(self).__name__}\'s plugin is not initialized properly.'
+        assert isinstance(self.plugin, TorchDDPPlugin), f"{type(self).__name__}'s plugin is not initialized properly."

    def setup_distributed(self) -> None:
        self._try_init_dist(force=True)
@@ -73,12 +69,14 @@ class DDPStrategy(Strategy):
        torch.manual_seed(seed)

    def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
-        return self.plugin.prepare_dataloader(data_buffer,
-                                              batch_size=data_buffer.sample_batch_size,
-                                              shuffle=True,
-                                              drop_last=True,
-                                              pin_memory=pin_memory,
-                                              collate_fn=data_buffer.collate_fn)
+        return self.plugin.prepare_dataloader(
+            data_buffer,
+            batch_size=data_buffer.sample_batch_size,
+            shuffle=True,
+            drop_last=True,
+            pin_memory=pin_memory,
+            collate_fn=data_buffer.collate_fn,
+        )

    def setup_sampler(self, dataset) -> DistributedSampler:
        # FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
@@ -88,11 +86,9 @@ class DDPStrategy(Strategy):
        assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
        return model.unwrap()

-    def save_pretrained(self,
-                        model: nn.Module,
-                        path: str,
-                        only_rank0: bool = True,
-                        tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
+    def save_pretrained(
+        self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+    ) -> None:
        if not only_rank0 or dist.get_rank() == 0:
            unwrapped_model = self.unwrap_model(model)
            assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
@@ -103,17 +99,11 @@ class DDPStrategy(Strategy):
            if tokenizer is not None:
                tokenizer.save_pretrained(path)
        model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model,
-                        model_path,
-                        only_rank0=only_rank0)
+        self.save_model(model, model_path, only_rank0=only_rank0)

-        def _replace_keys(model_path: str,
-                          replace_fn: Callable):
+        def _replace_keys(model_path: str, replace_fn: Callable):
            state_dict = torch.load(model_path, map_location="cpu")
-            state_dict = {
-                replace_fn(k): v
-                for k, v in state_dict.items()
-            }
+            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
            torch.save(state_dict, model_path)

        # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
@@ -124,13 +114,13 @@ class DDPStrategy(Strategy):
    def get_model_state_dict_shard(self, model: nn.Module, **config):
        # TODO: implement sharding on naive strategy
        model = self.unwrap_model(model)
-        if 'requires_grad_only' in config and config['requires_grad_only'] == True:
+        if "requires_grad_only" in config and config["requires_grad_only"] == True:
            state_dict = get_grad_required_state_dict(model)
        else:
            state_dict = model.state_dict()

-        if 'shard_size' in config:
-            shard_size = config['shard_size']
+        if "shard_size" in config:
+            shard_size = config["shard_size"]
            accumulate_size = 0
            state_dict_shard = OrderedDict()
            for name, param in state_dict.items():

--- a/applications/Chat/coati/trainer/strategies/sampler.py
+++ b/applications/Chat/coati/trainer/strategies/sampler.py
@@ -4,7 +4,6 @@ import numpy as np


 class DistributedSampler:
-
    def __init__(self, dataset, num_replicas: int, rank: int) -> None:
        self.dataset = dataset
        self.num_replicas = num_replicas
@@ -12,7 +11,7 @@ class DistributedSampler:

        if len(self.dataset) % self.num_replicas != 0:
            self.num_samples = math.ceil(
-                (len(self.dataset) - self.num_replicas) / self.num_replicas    # type: ignore[arg-type]
+                (len(self.dataset) - self.num_replicas) / self.num_replicas  # type: ignore[arg-type]
            )
        else:
            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
@@ -20,10 +19,10 @@ class DistributedSampler:
        self.total_size = self.num_samples * self.num_replicas

        indices = list(range(len(self.dataset)))
-        indices = indices[:self.total_size]
+        indices = indices[: self.total_size]
        assert len(indices) == self.total_size
        # subsample
-        indices = indices[self.rank:self.total_size:self.num_replicas]
+        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples
        self.indices = indices


--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -42,7 +42,6 @@ def is_rank_0() -> bool:


 def to_device(x: Any, device: torch.device) -> Any:
-
    def _to(t: Any):
        if isinstance(t, torch.Tensor):
            return t.to(device)

--- a/applications/Chat/evaluate/config/config_cn.json
+++ b/applications/Chat/evaluate/config/config_cn.json
@@ -70,7 +70,7 @@
        "BLEU",
        "ROUGE",
        "BERTScore"
-      ]   
+      ]
    },
    "logical_reasoning": {
      "GPT": [
@@ -83,7 +83,7 @@
        "ROUGE",
        "BERTScore",
        "CHRF"
-      ]   
+      ]
    },
    "open_qa": {
      "GPT": [
@@ -126,7 +126,7 @@
        "conciseness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "Finance": {
      "GPT": [
@@ -134,7 +134,7 @@
        "correctness"
      ],
      "Metrics": [
-      ] 
+      ]
    },
    "Law": {
      "GPT": [
@@ -142,7 +142,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "Education": {
      "GPT": [
@@ -150,7 +150,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "Medical": {
      "GPT": [
@@ -158,7 +158,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "STEM": {
      "GPT": [
@@ -166,7 +166,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "SocialScience": {
      "GPT": [
@@ -174,7 +174,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "Humanity": {
      "GPT": [
@@ -182,7 +182,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "Other": {
      "GPT": [
@@ -190,7 +190,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    },
    "ethics": {
      "GPT": [
@@ -198,7 +198,7 @@
        "correctness"
      ],
      "Metrics": [
-      ]   
+      ]
    }
  }
 }
--- a/applications/Chat/evaluate/eval.py
+++ b/applications/Chat/evaluate/eval.py
 import argparse
-import json
 import os

 import openai
@@ -9,7 +8,8 @@ from utils import jload

 def main(args):
    assert len(args.answer_file_list) == len(
-        args.model_name_list), "The number of answer files and model names should be equal!"
+        args.model_name_list
+    ), "The number of answer files and model names should be equal!"

    # load config
    config = jload(args.config_file)
@@ -36,7 +36,8 @@ def main(args):

        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
            raise Exception(
-                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
+                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
+            )

        if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
            raise Exception(
@@ -44,8 +45,15 @@ def main(args):
            )

        # initialize evaluator
-        evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
-                              config["language"], config.get("path_for_UniEval", None), args.gpt_with_reference)
+        evaluator = Evaluator(
+            metrics_per_category,
+            battle_prompt,
+            gpt_evaluation_prompt,
+            args.gpt_model,
+            config["language"],
+            config.get("path_for_UniEval", None),
+            args.gpt_with_reference,
+        )
        if len(args.model_name_list) == 2:
            answers1 = jload(args.answer_file_list[0])
            answers2 = jload(args.answer_file_list[1])
@@ -68,41 +76,41 @@ def main(args):
        raise ValueError(f'Unsupported language {config["language"]}!')


-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
-    parser.add_argument('--config_file',
-                        type=str,
-                        default=None,
-                        required=True,
-                        help='path to the file of target results')
-    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
-    parser.add_argument('--gpt_evaluation_prompt_file',
-                        type=str,
-                        default=None,
-                        help='path to the prompt file for gpt evaluation')
-    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
-    parser.add_argument('--answer_file_list',
-                        type=str,
-                        nargs='+',
-                        default=[],
-                        required=True,
-                        help='path to the answer files of at most 2 models')
-    parser.add_argument('--model_name_list',
-                        type=str,
-                        nargs='+',
-                        default=[],
-                        required=True,
-                        help='the names of at most 2 models')
-    parser.add_argument('--gpt_model',
-                        default="gpt-3.5-turbo",
-                        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
-                        help='which GPT model to use for evaluation')
-    parser.add_argument('--gpt_with_reference',
-                        default=False,
-                        action="store_true",
-                        help='whether to include reference answer in gpt evaluation')
-    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
-    parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
+    parser.add_argument(
+        "--config_file", type=str, default=None, required=True, help="path to the file of target results"
+    )
+    parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
+    parser.add_argument(
+        "--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
+    )
+    parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
+    parser.add_argument(
+        "--answer_file_list",
+        type=str,
+        nargs="+",
+        default=[],
+        required=True,
+        help="path to the answer files of at most 2 models",
+    )
+    parser.add_argument(
+        "--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
+    )
+    parser.add_argument(
+        "--gpt_model",
+        default="gpt-3.5-turbo",
+        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
+        help="which GPT model to use for evaluation",
+    )
+    parser.add_argument(
+        "--gpt_with_reference",
+        default=False,
+        action="store_true",
+        help="whether to include reference answer in gpt evaluation",
+    )
+    parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
+    parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
    args = parser.parse_args()

    if args.openai_key is not None:

--- a/applications/Chat/evaluate/evaluator.py
+++ b/applications/Chat/evaluate/evaluator.py
@@ -3,20 +3,27 @@ from typing import Any, Dict, List

 import gpt_evaluate
 import metrics
-import pandas as pd
 import unieval
 from utils import analyze_automatic_results, get_data_per_category, save_automatic_results


 class Evaluator(object):
    """
-        A class named Evaluator includes GPT-3.5/GPT-4 evaluation
-        and automatic evaluation
+    A class named Evaluator includes GPT-3.5/GPT-4 evaluation
+    and automatic evaluation

    """

-    def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
-                 gpt_model: str, language: str, path_for_UniEval: Dict[str, str], gpt_with_reference: bool) -> None:
+    def __init__(
+        self,
+        params: Dict[str, Any],
+        battle_prompt: Dict[str, Any],
+        gpt_evaluation_prompt: Dict[str, Any],
+        gpt_model: str,
+        language: str,
+        path_for_UniEval: Dict[str, str],
+        gpt_with_reference: bool,
+    ) -> None:
        self.params = params
        self.battle_prompt = battle_prompt
        self.gpt_evaluation_prompt = gpt_evaluation_prompt
@@ -103,7 +110,8 @@ class Evaluator(object):

            if self.params[category]["UniEval"] and self.language == "cn":
                raise Exception(
-                    "UniEval doesn't support Chinese! Please remove UniEval config in your Chinese config file.")
+                    "UniEval doesn't support Chinese! Please remove UniEval config in your Chinese config file."
+                )

            category_metrics = self.params[category]["UniEval"]

@@ -134,10 +142,9 @@ class Evaluator(object):
                sources_list = [answer["instruction"] + answer["input"] for answer in answers_per_category[category]]

                data = unieval.convert_data_to_unieval_format(predicts_list, sources_list, targets_list)
-                scores = uni_evaluator.evaluate(data,
-                                                category,
-                                                dims=list(self.unieval_metric_stats[task][category].keys()),
-                                                overall=False)
+                scores = uni_evaluator.evaluate(
+                    data, category, dims=list(self.unieval_metric_stats[task][category].keys()), overall=False
+                )
                avg_scores = unieval.calculate_average_score(scores)

                self.unieval_metric_stats[task][category].update(avg_scores)
@@ -165,7 +172,8 @@ class Evaluator(object):
                category,
                self.gpt_model,
                self.language,
-                references=targets_per_category[category] if self.gpt_with_reference else None)
+                references=targets_per_category[category] if self.gpt_with_reference else None,
+            )

    def save(self, path: str, model_name_list: List[str]) -> None:
        """
@@ -204,16 +212,18 @@ class Evaluator(object):
                gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
                gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")

-                all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0],
-                                                                           self.gpt_evaluation_results,
-                                                                           gpt_evaluation_results_save_path)
+                all_evaluations = gpt_evaluate.save_gpt_evaluation_results(
+                    model_name_list[0], self.gpt_evaluation_results, gpt_evaluation_results_save_path
+                )

                # Start to calculate scores and save statistics.
                gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
-                gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
-                                                            gpt_evaluation_statistics_save_path)
+                gpt_evaluate.save_gpt_evaluation_statistics(
+                    model_name_list[0], all_evaluations, gpt_evaluation_statistics_save_path
+                )

                # Save charts and csv.
                gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
-                gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
-                                                               gpt_evaluation_analyses_save_path)
+                gpt_evaluate.analyze_gpt_evaluation_statistics(
+                    gpt_evaluation_statistics_save_path, gpt_evaluation_analyses_save_path
+                )
--- a/applications/Chat/evaluate/gpt_evaluate.py
+++ b/applications/Chat/evaluate/gpt_evaluate.py
@@ -14,20 +14,18 @@ import tqdm
 from utils import jdump, jload

 ref_step_template = {
-    "en":
-        "Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
-    "cn":
-        "请比较答案与上面的{adjective}答案，确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n"
+    "en": "Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
+    "cn": "请比较答案与上面的{adjective}答案，确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n",
 }

 ref_answer_template_general = {
    "en": "\nAn example answer with good quality is as follows:\n\n{answer}\n\n",
-    "cn": "\n一个优质的示例答案如下：\n\n{answer}\n\n"
+    "cn": "\n一个优质的示例答案如下：\n\n{answer}\n\n",
 }

 ref_answer_template_correctness = {
    "en": "\nA correct answer is as follows:\n\n{answer}\n\n",
-    "cn": "\n标准答案如下：\n\n{answer}\n\n"
+    "cn": "\n标准答案如下：\n\n{answer}\n\n",
 }


@@ -51,10 +49,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
-                    {
-                        "role": "system",
-                        "content": sys_prompt
-                    },
+                    {"role": "system", "content": sys_prompt},
                    {
                        "role": "user",
                        "content": user_prompt,
@@ -106,7 +101,7 @@ def parse_battle_score(evaluation: str) -> List[float]:
            return [float(sp[0]), float(sp[1])]
        else:
            raise Exception(f"Invalid score pair. Got {evaluation}.")
-    except Exception as e:
+    except Exception:
        return [-1, -1]


@@ -125,9 +120,6 @@ def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]

    assert len(answer1) == len(answer2)

-    handles = []
-    evaluation_file = []
-
    total_len = len(answer1)
    question_idx_list = list(range(total_len))

@@ -140,9 +132,12 @@ def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]
            assert answer1[i]["id"] == answer2[i]["id"]
            answer_id = answer1[i]["id"]

-            ques = answer1[i]["instruction"] if answer1[i][
-                "input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"]
-            cat = answer1[i]["category"]
+            ques = (
+                answer1[i]["instruction"]
+                if answer1[i]["input"] == ""
+                else answer1[i]["instruction"] + " " + answer1[i]["input"]
+            )
+            answer1[i]["category"]
            ans1 = answer1[i]["output"]
            ans2 = answer2[i]["output"]

@@ -267,7 +262,11 @@ def reference_template(metric: str, language: str, reference: Dict[str, Any]) ->

    step_to_add = ref_step_template[language]

-    for_the_given_answer = "{metric} (1-5) (directly give the score for the given answer):" if language == "en" else "{metric} (1-5) (直接对给定答案打分)"
+    for_the_given_answer = (
+        "{metric} (1-5) (directly give the score for the given answer):"
+        if language == "en"
+        else "{metric} (1-5) (直接对给定答案打分)"
+    )

    # adjective is used to describe the word "answer" in the prompt.
    adjective = "example" if language == "en" else "示例"
@@ -280,8 +279,9 @@ def reference_template(metric: str, language: str, reference: Dict[str, Any]) ->
        answer_to_add = ref_answer_template_correctness[language]

    answer_to_add = answer_to_add.format(answer=reference["target"] if reference["target"] else reference["output"])
-    step_to_add = step_to_add.format(metric=metric.lower(),
-                                     adjective=adjective) + for_the_given_answer.format(metric=metric)
+    step_to_add = step_to_add.format(metric=metric.lower(), adjective=adjective) + for_the_given_answer.format(
+        metric=metric
+    )

    return answer_to_add + step_to_add

@@ -329,7 +329,8 @@ def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens:
        for j in range(i):
            messages_to_send.append(fill_in_message("user", user_messages[j]))
            messages_to_send.append(
-                fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"]))
+                fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"])
+            )

        # Length of user messages == Length of assistant messages + 1
        # Because we always expect the api to response
@@ -351,13 +352,15 @@ def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens:
    return assistant_responses[-1]


-def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
-                                        inst: Dict[str, Any],
-                                        metrics: List[str],
-                                        language: str,
-                                        reference: Dict[str, Any] = None,
-                                        model: str = "gpt-3.5-turbo",
-                                        max_tokens: int = 2048) -> Dict[str, Any]:
+def get_gpt_evaluation_without_logprobs(
+    prompt: Dict[str, Any],
+    inst: Dict[str, Any],
+    metrics: List[str],
+    language: str,
+    reference: Dict[str, Any] = None,
+    model: str = "gpt-3.5-turbo",
+    max_tokens: int = 2048,
+) -> Dict[str, Any]:
    """
    Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.

@@ -378,7 +381,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],

    MAX_API_RETRY = 3

-    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
+    question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
    answer = inst["output"]
    inst["evaluation"] = {}

@@ -400,10 +403,9 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],

                if prompt_reference:
                    # Do a 2-round conversation
-                    response = multiturn_chat_completion([prompt_1st_round, prompt_reference],
-                                                         model,
-                                                         max_tokens=max_tokens,
-                                                         turns=2)
+                    response = multiturn_chat_completion(
+                        [prompt_1st_round, prompt_reference], model, max_tokens=max_tokens, turns=2
+                    )
                else:
                    response = multiturn_chat_completion([prompt_1st_round], model, max_tokens=max_tokens, turns=1)

@@ -427,10 +429,9 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
    return inst


-def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
-                                     inst: Dict[str, Any],
-                                     metrics: List[str],
-                                     max_tokens: int = 2048) -> Dict[str, Any]:
+def get_gpt_evaluation_with_logprobs(
+    prompt: Dict[str, Any], inst: Dict[str, Any], metrics: List[str], max_tokens: int = 2048
+) -> Dict[str, Any]:
    """
    Use completion model(text-davinci-003) to evaluate one model answer.
    Only completion models can return log probabilities.
@@ -449,7 +450,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],

    MAX_API_RETRY = 3

-    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
+    question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
    answer = inst["output"]
    inst["evaluation"] = {}

@@ -492,13 +493,15 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
    return inst


-def evaluate(answers: List[Dict],
-             prompt: Dict[str, Any],
-             metrics: List[str],
-             category: str,
-             model: str,
-             language: str,
-             references: List[Dict] = None) -> List[Dict]:
+def evaluate(
+    answers: List[Dict],
+    prompt: Dict[str, Any],
+    metrics: List[str],
+    category: str,
+    model: str,
+    language: str,
+    references: List[Dict] = None,
+) -> List[Dict]:
    """
    Use GPT models to evaluate model answers and save evaluation results.

@@ -529,21 +532,23 @@ def evaluate(answers: List[Dict],
            if model == "text-davinci-003":
                future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
            else:
-                future = executor.submit(get_gpt_evaluation_without_logprobs,
-                                         prompt,
-                                         inst,
-                                         metrics,
-                                         language,
-                                         reference=None if references is None else references[idx],
-                                         model=model,
-                                         max_tokens=1)
+                future = executor.submit(
+                    get_gpt_evaluation_without_logprobs,
+                    prompt,
+                    inst,
+                    metrics,
+                    language,
+                    reference=None if references is None else references[idx],
+                    model=model,
+                    max_tokens=1,
+                )

            futures.append(future)

        for future in tqdm.tqdm(
-                concurrent.futures.as_completed(futures),
-                desc=f"{category}: ",
-                total=len(futures),
+            concurrent.futures.as_completed(futures),
+            desc=f"{category}: ",
+            total=len(futures),
        ):
            evaluations.append(future.result())

@@ -610,12 +615,13 @@ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) ->
            return int(results[0])
        else:
            raise Exception(f"Invalid score pair. Got {evaluation}.")
-    except Exception as e:
+    except Exception:
        return 0


-def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
-                                save_path: str) -> Dict[str, Any]:
+def save_gpt_evaluation_results(
+    model_name: str, gpt_evaluation_results: Dict[str, Any], save_path: str
+) -> Dict[str, Any]:
    """
    Save evaluation results for different categories for one model.

@@ -667,10 +673,12 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], sav
                    scores[metric].append(0)
                elif evaluation["evaluation"][metric]["logprobs"] is not None:
                    scores[metric].append(
-                        calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
+                        calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])
+                    )
                else:
                    scores[metric].append(
-                        calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
+                        calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation)
+                    )

        statistics = {}
        for metric in metrics:
@@ -751,9 +759,9 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> N
    frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))

    for category in tqdm.tqdm(
-            frame_per_category.keys(),
-            desc=f"GPT evaluation: ",
-            total=len(frame_per_category.keys()),
+        frame_per_category.keys(),
+        desc=f"GPT evaluation: ",
+        total=len(frame_per_category.keys()),
    ):
        data = pd.DataFrame(frame_per_category[category])


--- a/applications/Chat/evaluate/metrics.py
+++ b/applications/Chat/evaluate/metrics.py
@@ -21,13 +21,17 @@ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str,
    """
    bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
    cumulative_bleu = [0] * 4
-    weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.),
-               (1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
+    weights = [
+        (1.0 / 1.0, 0.0, 0.0, 0.0),
+        (1.0 / 2.0, 1.0 / 2.0, 0.0, 0.0),
+        (1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0),
+        (1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0),
+    ]

    for pred, target in zip(preds, targets):
        if language == "cn":
-            pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
-            target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()]
+            pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
+            target_list = [(" ".join(jieba.cut(preprocessing_text(target)))).split()]
        elif language == "en":
            pred_list = preprocessing_text(pred).split()
            target_list = [preprocessing_text(target).split()]
@@ -42,15 +46,14 @@ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str,


 def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
-    """Calculate CHRF Score Metric in sentence level.
-    """
+    """Calculate CHRF Score Metric in sentence level."""
    chrf_score = {"chrf": 0}
    cumulative_chrf = []

    for pred, target in zip(preds, targets):
        if language == "cn":
-            pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
-            target_list = ' '.join(jieba.cut(preprocessing_text(target))).split()
+            pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
+            target_list = " ".join(jieba.cut(preprocessing_text(target))).split()
        elif language == "en":
            pred_list = preprocessing_text(pred).split()
            target_list = preprocessing_text(target).split()
@@ -75,8 +78,8 @@ def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
    all_targets = []

    for pred, target in zip(preds, targets):
-        pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred))))
-        target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target))))
+        pred_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(pred))))
+        target_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(target))))
        all_preds.append(pred_list)
        all_targets.append(target_list)

@@ -99,16 +102,14 @@ def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
    longest common subsequence (LCS) between preds and targets.
    """
    rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
-    all_preds = []
-    all_targets = []

    rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)

    for pred, target in zip(preds, targets):
        score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
-        rouge_scores["rouge1"] += score['rouge1'].fmeasure
-        rouge_scores["rouge2"] += score['rouge2'].fmeasure
-        rouge_scores["rougeL"] += score['rougeL'].fmeasure
+        rouge_scores["rouge1"] += score["rouge1"].fmeasure
+        rouge_scores["rouge2"] += score["rouge2"].fmeasure
+        rouge_scores["rougeL"] += score["rougeL"].fmeasure

    rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
    rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
@@ -137,7 +138,7 @@ def distinct_score(preds: List[str], language: str) -> Dict[str, float]:

    for pred in preds:
        if language == "cn":
-            pred_seg_list = ' '.join(jieba.cut(pred)).split()
+            pred_seg_list = " ".join(jieba.cut(pred)).split()
            count_segs = len(pred_seg_list)
            unique_segs = set(pred_seg_list)
            count_unique_chars = len(unique_segs)
@@ -151,7 +152,7 @@ def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
            split_pred = preprocessing_text(pred).split()
            for n in range(0, 3):
                for i in range(0, len(split_pred) - n):
-                    ngram = ' '.join(split_pred[i:i + n + 1])
+                    ngram = " ".join(split_pred[i : i + n + 1])
                    unique_ngram[n].add(ngram)
                    all_ngram_count[n] += 1

@@ -203,8 +204,8 @@ def calculate_precision_recall_f1(preds: List[str], targets: List[str], language

    for pred, target in zip(preds, targets):
        if language == "cn":
-            pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()]
-            target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()]
+            pred_list = [char for char in " ".join(jieba.cut(preprocessing_text(pred))).split()]
+            target_list = [char for char in " ".join(jieba.cut(preprocessing_text(target))).split()]
        elif language == "en":
            pred_list = [char for char in preprocessing_text(pred).split()]
            target_list = [char for char in preprocessing_text(target).split()]

--- a/applications/Chat/evaluate/unieval/__init__.py
+++ b/applications/Chat/evaluate/unieval/__init__.py
@@ -7,6 +7,9 @@ from .utils import (
 )

 __all__ = [
-    'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results',
-    'analyze_unieval_results'
+    "get_evaluator",
+    "convert_data_to_unieval_format",
+    "calculate_average_score",
+    "save_unieval_results",
+    "analyze_unieval_results",
 ]
--- a/applications/Chat/evaluate/unieval/evaluator.py
+++ b/applications/Chat/evaluate/unieval/evaluator.py
@@ -28,29 +28,29 @@ from .utils import add_question


 class SumEvaluator:
-
-    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
-        """ Set up evaluator for text summarization """
+    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
+        """Set up evaluator for text summarization"""
        self.scorer = UniEvaluator(
-            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
+            model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
-            cache_dir=cache_dir)
-        self.task = 'summarization'
-        self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']
+            cache_dir=cache_dir,
+        )
+        self.task = "summarization"
+        self.dimensions = ["coherence", "consistency", "fluency", "relevance"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
-            Get the scores of all the given dimensions
+        Get the scores of all the given dimensions

-            category: The category to be evaluated.
+        category: The category to be evaluated.

-            dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
-                  four dimensions: coherence, consistency, fluency, relevance.
+        dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
+              four dimensions: coherence, consistency, fluency, relevance.

-            overall: indicates whether the overall score is to be calculated.
-                     Overall score can be customized to a combination of scores based on different
-                     dimensions. The default here is the average score of all the given dimensions.
+        overall: indicates whether the overall score is to be calculated.
+                 Overall score can be customized to a combination of scores based on different
+                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]
@@ -63,12 +63,12 @@ class SumEvaluator:

        for dim in eval_dims:
            # Calculate average sentence-level scores for 'consistency' and 'fluency'
-            if dim == 'consistency' or dim == 'fluency':
+            if dim == "consistency" or dim == "fluency":
                src_list, output_list = [], []
-                n_sents = []    # the number of sentences in each generated summary
+                n_sents = []  # the number of sentences in each generated summary
                for i in range(n_data):
-                    source = data[i]['source']
-                    system_outputs = sent_tokenize(data[i]['system_output'])
+                    source = data[i]["source"]
+                    system_outputs = sent_tokenize(data[i]["system_output"])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
@@ -81,24 +81,26 @@ class SumEvaluator:
                score = []
                for cur_n_sent in n_sents:
                    # prevent denominator from being 0
-                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
+                    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
                    start_idx += cur_n_sent

            # Calculate summary-level score for 'coherence' and 'relevance'
-            elif dim == 'coherence' or dim == 'relevance':
+            elif dim == "coherence" or dim == "relevance":
                src_list, output_list, ref_list = [], [], []
                for i in range(n_data):
-                    src_list.append(data[i]['source'])
-                    output_list.append(data[i]['system_output'])
-                    if dim == 'relevance':
-                        ref_list.append(data[i]['reference'])
+                    src_list.append(data[i]["source"])
+                    output_list.append(data[i]["system_output"])
+                    if dim == "relevance":
+                        ref_list.append(data[i]["reference"])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for summarization
            else:
-                raise NotImplementedError('The input format for this dimension is still undefined. \
-                                           Please customize it first.')
+                raise NotImplementedError(
+                    "The input format for this dimension is still undefined. \
+                                           Please customize it first."
+                )

            for i in range(n_data):
                eval_scores[i][dim] = score[i]
@@ -106,35 +108,35 @@ class SumEvaluator:
        # Customize your overall score here.
        if overall == True:
            for i in range(n_data):
-                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
+                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores


 class DialogEvaluator:
-
-    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
-        """ Set up evaluator for dialogues """
+    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
+        """Set up evaluator for dialogues"""
        self.scorer = UniEvaluator(
-            model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path,
+            model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
-            cache_dir=cache_dir)
-        self.task = 'dialogue'
-        self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability']
+            cache_dir=cache_dir,
+        )
+        self.task = "dialogue"
+        self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
-            Get the scores of all the given dimensions
+        Get the scores of all the given dimensions

-            category: The category to be evaluated.
+        category: The category to be evaluated.

-            dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
-                  five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
+        dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
+              five dimensions: naturalness, coherence, engagingness, groundedness and understandability.

-            overall: indicates whether the overall score is to be calculated.
-                     Overall score can be customized to a combination of scores based on different
-                     dimensions. The default here is the average score of all the given dimensions.
+        overall: indicates whether the overall score is to be calculated.
+                 Overall score can be customized to a combination of scores based on different
+                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]
@@ -147,50 +149,48 @@ class DialogEvaluator:

        for dim in eval_dims:
            # Calculate summation score for 'engagingness'
-            if dim == 'engagingness':
+            if dim == "engagingness":
                src_list, output_list, context_list = [], [], []
-                n_sents = []    # the number of sentences in each generated response
+                n_sents = []  # the number of sentences in each generated response
                for i in range(n_data):
-                    source = data[i]['source']
-                    context = data[i]['context']
-                    system_outputs = sent_tokenize(data[i]['system_output'])
+                    source = data[i]["source"]
+                    context = data[i]["context"]
+                    system_outputs = sent_tokenize(data[i]["system_output"])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        context_list.append(context)
                        output_list.append(system_outputs[j])
-                input_list = add_question(dimension=dim,
-                                          output=output_list,
-                                          src=src_list,
-                                          context=context_list,
-                                          task=self.task)
+                input_list = add_question(
+                    dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
+                )
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get the summation score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
-                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
+                    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
                    start_idx += cur_n_sent

            # Calculate turn-level score for other dimensions
-            elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
+            elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
                src_list, output_list, context_list = [], [], []
                for i in range(n_data):
-                    src_list.append(data[i]['source'])
-                    output_list.append(data[i]['system_output'])
-                    context_list.append(data[i]['context'])
-                input_list = add_question(dimension=dim,
-                                          output=output_list,
-                                          src=src_list,
-                                          context=context_list,
-                                          task=self.task)
+                    src_list.append(data[i]["source"])
+                    output_list.append(data[i]["system_output"])
+                    context_list.append(data[i]["context"])
+                input_list = add_question(
+                    dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
+                )
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for summarization
            else:
-                raise NotImplementedError('The input format for this dimension is still undefined. \
-                                           Please customize it first.')
+                raise NotImplementedError(
+                    "The input format for this dimension is still undefined. \
+                                           Please customize it first."
+                )

            for i in range(n_data):
                eval_scores[i][dim] = score[i]
@@ -198,35 +198,35 @@ class DialogEvaluator:
        # Customize your overall score here.
        if overall == True:
            for i in range(n_data):
-                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
+                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores


 class D2tEvaluator:
-
-    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
-        """ Set up evaluator for data-to-text """
+    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
+        """Set up evaluator for data-to-text"""
        self.scorer = UniEvaluator(
-            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
+            model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
-            cache_dir=cache_dir)
-        self.task = 'data2text'
-        self.dimensions = ['naturalness', 'informativeness']
+            cache_dir=cache_dir,
+        )
+        self.task = "data2text"
+        self.dimensions = ["naturalness", "informativeness"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
-            Get the scores of all the given dimensions
+        Get the scores of all the given dimensions

-            category: The category to be evaluated.
+        category: The category to be evaluated.

-            dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
-                  two dimensions: naturalness and informativeness.
+        dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
+              two dimensions: naturalness and informativeness.

-            overall: indicates whether the overall score is to be calculated.
-                     Overall score can be customized to a combination of scores based on different
-                     dimensions. The default here is the average score of all the given dimensions.
+        overall: indicates whether the overall score is to be calculated.
+                 Overall score can be customized to a combination of scores based on different
+                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]
@@ -240,8 +240,8 @@ class D2tEvaluator:
        for dim in eval_dims:
            output_list, ref_list = [], []
            for i in range(n_data):
-                output_list.append(data[i]['system_output'])
-                ref_list.append(data[i]['reference'])
+                output_list.append(data[i]["system_output"])
+                ref_list.append(data[i]["reference"])

            input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
            score = self.scorer.score(input_list, self.task, category, dim)
@@ -252,38 +252,38 @@ class D2tEvaluator:
        # Customize your overall score here.
        if overall == True:
            for i in range(n_data):
-                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
+                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores


 class FactEvaluator:
-
-    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
-        """ Set up evaluator for factual consistency detection """
+    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
+        """Set up evaluator for factual consistency detection"""
        self.scorer = UniEvaluator(
-            model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
+            model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
-            cache_dir=cache_dir)
-        self.task = 'fact'
-        self.dim = 'consistency'
+            cache_dir=cache_dir,
+        )
+        self.task = "fact"
+        self.dim = "consistency"

    def evaluate(self, data, category):
        """
-            Get the factual consistency score (only 1 dimension for this task)
+        Get the factual consistency score (only 1 dimension for this task)

-            category: The category to be evaluated.
+        category: The category to be evaluated.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        # Calculate average sentence-level scores for factual consistency
        src_list, output_list = [], []
-        n_sents = []    # the number of sentences in the claim
+        n_sents = []  # the number of sentences in the claim
        for i in range(n_data):
-            source = data[i]['source']
-            system_outputs = sent_tokenize(data[i]['system_output'])
+            source = data[i]["source"]
+            system_outputs = sent_tokenize(data[i]["system_output"])
            n_sents.append(len(system_outputs))
            for j in range(len(system_outputs)):
                src_list.append(source)
@@ -295,7 +295,7 @@ class FactEvaluator:
        start_idx = 0
        score = []
        for cur_n_sent in n_sents:
-            score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
+            score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / cur_n_sent)
            start_idx += cur_n_sent

        for i in range(n_data):
@@ -304,28 +304,26 @@ class FactEvaluator:
        return eval_scores


-def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
-    assert task in ['summarization', 'dialogue', 'data2text', 'fact']
-    if task == 'summarization':
-        return SumEvaluator(model_name_or_path=model_name_or_path,
-                            max_length=max_length,
-                            device=device,
-                            cache_dir=cache_dir)
-    elif task == 'dialogue':
-        return DialogEvaluator(model_name_or_path=model_name_or_path,
-                               max_length=max_length,
-                               device=device,
-                               cache_dir=cache_dir)
-    elif task == 'data2text':
-        return D2tEvaluator(model_name_or_path=model_name_or_path,
-                            max_length=max_length,
-                            device=device,
-                            cache_dir=cache_dir)
-    elif task == 'fact':
-        return FactEvaluator(model_name_or_path=model_name_or_path,
-                             max_length=max_length,
-                             device=device,
-                             cache_dir=cache_dir)
+def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
+    assert task in ["summarization", "dialogue", "data2text", "fact"]
+    if task == "summarization":
+        return SumEvaluator(
+            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
+        )
+    elif task == "dialogue":
+        return DialogEvaluator(
+            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
+        )
+    elif task == "data2text":
+        return D2tEvaluator(
+            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
+        )
+    elif task == "fact":
+        return FactEvaluator(
+            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
+        )
    else:
-        raise NotImplementedError('Other tasks are not implemented, \
-                                   please customize specific tasks here.')
+        raise NotImplementedError(
+            "Other tasks are not implemented, \
+                                   please customize specific tasks here."
+        )
--- a/applications/Chat/evaluate/unieval/scorer.py
+++ b/applications/Chat/evaluate/unieval/scorer.py
@@ -27,9 +27,8 @@ from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer


 class UniEvaluator:
-
-    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
-        """ Set up model """
+    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
+        """Set up model"""
        self.device = device
        self.max_length = max_length

@@ -47,8 +46,8 @@ class UniEvaluator:

    def score(self, inputs, task, category, dim, batch_size=8):
        """
-            Get scores for the given samples.
-            final_score = postive_score / (postive_score + negative_score)
+        Get scores for the given samples.
+        final_score = positive_score / (positive_score + negative_score)
        """

        # The implementation of "forward" in T5 still requires decoder_input_ids.
@@ -58,31 +57,27 @@ class UniEvaluator:

        pos_score_list, neg_score_list = [], []
        for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
-            src_list = inputs[i:i + batch_size]
-            tgt_list = tgts[i:i + batch_size]
+            src_list = inputs[i : i + batch_size]
+            tgt_list = tgts[i : i + batch_size]
            try:
                with torch.no_grad():
-                    encoded_src = self.tokenizer(src_list,
-                                                 max_length=self.max_length,
-                                                 truncation=True,
-                                                 padding=True,
-                                                 return_tensors='pt')
-                    encoded_tgt = self.tokenizer(tgt_list,
-                                                 max_length=self.max_length,
-                                                 truncation=True,
-                                                 padding=True,
-                                                 return_tensors='pt')
-
-                    src_tokens = encoded_src['input_ids'].to(self.device)
-                    src_mask = encoded_src['attention_mask'].to(self.device)
-
-                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1)
+                    encoded_src = self.tokenizer(
+                        src_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
+                    )
+                    encoded_tgt = self.tokenizer(
+                        tgt_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
+                    )
+
+                    src_tokens = encoded_src["input_ids"].to(self.device)
+                    src_mask = encoded_src["attention_mask"].to(self.device)
+
+                    tgt_tokens = encoded_tgt["input_ids"].to(self.device)[:, 0].unsqueeze(-1)

                    output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
                    logits = output.logits.view(-1, self.model.config.vocab_size)

-                    pos_score = self.softmax(logits)[:, self.pos_id]    # Yes
-                    neg_score = self.softmax(logits)[:, self.neg_id]    # No
+                    pos_score = self.softmax(logits)[:, self.pos_id]  # Yes
+                    neg_score = self.softmax(logits)[:, self.neg_id]  # No

                    cur_pos_score = [x.item() for x in pos_score]
                    cur_neg_score = [x.item() for x in neg_score]
@@ -90,8 +85,8 @@ class UniEvaluator:
                    neg_score_list += cur_neg_score

            except RuntimeError:
-                print(f'source: {src_list}')
-                print(f'target: {tgt_list}')
+                print(f"source: {src_list}")
+                print(f"target: {tgt_list}")
                exit(0)

        score_list = []

--- a/applications/Chat/evaluate/unieval/utils.py
+++ b/applications/Chat/evaluate/unieval/utils.py
@@ -31,105 +31,142 @@ import tqdm

 def add_question(dimension, output, src=None, ref=None, context=None, task=None):
    """
-        Add questions to generate input in Bool-QA format for UniEval.
-
-        dimension: specific dimension to be evaluated
-        src: source input for different NLG tasks. For example, source document for summarization
-             and dialogue history for dialogue response generation.
-        output: output text generated by the models
-        ref: human-annotated groundtruth
-        context: the context needed to evaluate several specific dimension. For example,
-                 additional factual information when evaluating engagingness and groundedness in dialogues.
+    Add questions to generate input in Bool-QA format for UniEval.
+
+    dimension: specific dimension to be evaluated
+    src: source input for different NLG tasks. For example, source document for summarization
+         and dialogue history for dialogue response generation.
+    output: output text generated by the models
+    ref: human-annotated groundtruth
+    context: the context needed to evaluate several specific dimensions. For example,
+             additional factual information when evaluating engagingness and groundedness in dialogues.
    """

    input_with_question = []
    for i in range(len(output)):
        # For summarization
-        if task == 'summarization':
-            if dimension == 'fluency':
-                cur_input = 'question: Is this a fluent paragraph? </s> paragraph: ' + output[i]
-            elif dimension == 'coherence':
-                cur_input = 'question: Is this a coherent summary to the document? </s> summary: ' + output[
-                    i] + ' </s> document: ' + src[i]
-            elif dimension == 'consistency':
-                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[
-                    i] + ' </s> document: ' + src[i]
-            elif dimension == 'relevance':
-                cur_input = 'question: Is this summary relevant to the reference? </s> summary: ' + output[
-                    i] + ' </s> reference: ' + ref[i]
+        if task == "summarization":
+            if dimension == "fluency":
+                cur_input = "question: Is this a fluent paragraph? </s> paragraph: " + output[i]
+            elif dimension == "coherence":
+                cur_input = (
+                    "question: Is this a coherent summary to the document? </s> summary: "
+                    + output[i]
+                    + " </s> document: "
+                    + src[i]
+                )
+            elif dimension == "consistency":
+                cur_input = (
+                    "question: Is this claim consistent with the document? </s> claim: "
+                    + output[i]
+                    + " </s> document: "
+                    + src[i]
+                )
+            elif dimension == "relevance":
+                cur_input = (
+                    "question: Is this summary relevant to the reference? </s> summary: "
+                    + output[i]
+                    + " </s> reference: "
+                    + ref[i]
+                )
            else:
                raise NotImplementedError(
-                    'The input format for this dimension is still undefined. Please customize it first.')
+                    "The input format for this dimension is still undefined. Please customize it first."
+                )
        # For dialogues
-        elif task == 'dialogue':
-            if dimension == 'naturalness':
-                cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + output[i]
-            elif dimension == 'coherence':
-                cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: '\
-                            + output[i] + ' </s> dialogue history: ' + src[i]
-            elif dimension == 'engagingness':
-                cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: '\
-                            + output[i] + ' </s> dialogue history: ' + src[i] + ' </s> fact: ' + context[i]
-            elif dimension == 'groundedness':
-                cur_input = 'question: Is this response consistent with knowledge in the fact? </s> response: '\
-                            + output[i] + ' </s> fact: ' + context[i]
-            elif dimension == 'understandability':
-                cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + output[i]
+        elif task == "dialogue":
+            if dimension == "naturalness":
+                cur_input = "question: Is this a natural response in the dialogue? </s> response: " + output[i]
+            elif dimension == "coherence":
+                cur_input = (
+                    "question: Is this a coherent response given the dialogue history? </s> response: "
+                    + output[i]
+                    + " </s> dialogue history: "
+                    + src[i]
+                )
+            elif dimension == "engagingness":
+                cur_input = (
+                    "question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: "
+                    + output[i]
+                    + " </s> dialogue history: "
+                    + src[i]
+                    + " </s> fact: "
+                    + context[i]
+                )
+            elif dimension == "groundedness":
+                cur_input = (
+                    "question: Is this response consistent with knowledge in the fact? </s> response: "
+                    + output[i]
+                    + " </s> fact: "
+                    + context[i]
+                )
+            elif dimension == "understandability":
+                cur_input = "question: Is this an understandable response in the dialogue? </s> response: " + output[i]
            else:
                raise NotImplementedError(
-                    'The input format for this dimension is still undefined. Please customize it first.')
+                    "The input format for this dimension is still undefined. Please customize it first."
+                )
        # For data-to-text
-        elif task == 'data2text':
-            if dimension == 'naturalness':
-                cur_input = 'question: Is this a fluent utterance? </s> utterance: ' + output[i]
-            elif dimension == 'informativeness':
-                cur_input = 'question: Is this sentence informative according to the reference? </s> sentence: '\
-                            + output[i] + ' </s> reference: ' + ref[i]
+        elif task == "data2text":
+            if dimension == "naturalness":
+                cur_input = "question: Is this a fluent utterance? </s> utterance: " + output[i]
+            elif dimension == "informativeness":
+                cur_input = (
+                    "question: Is this sentence informative according to the reference? </s> sentence: "
+                    + output[i]
+                    + " </s> reference: "
+                    + ref[i]
+                )
            else:
                raise NotImplementedError(
-                    'The input format for this dimension is still undefined. Please customize it first.')
+                    "The input format for this dimension is still undefined. Please customize it first."
+                )
        # For factual consistency detection
-        elif task == 'fact':
-            if dimension == 'consistency':
-                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[
-                    i] + ' </s> document: ' + src[i]
+        elif task == "fact":
+            if dimension == "consistency":
+                cur_input = (
+                    "question: Is this claim consistent with the document? </s> claim: "
+                    + output[i]
+                    + " </s> document: "
+                    + src[i]
+                )
            else:
-                raise NotImplementedError('No other dimensions for the factual consistency detection task.')
+                raise NotImplementedError("No other dimensions for the factual consistency detection task.")
        # For new customized tasks
        else:
-            raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
+            raise NotImplementedError("Other tasks are not implemented, please customize specific tasks here.")
        input_with_question.append(cur_input)
    return input_with_question


 def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
    """
-        Convert the data into the unieval's format.
+    Convert the data into the unieval's format.

-        output_list: a list of model output
+    output_list: a list of model output

-        src_list: source input for different NLG tasks. For example, source document for summarization
-                  and dialogue history for dialogue response generation
-        ref_list: human-annotated groundtruth
+    src_list: source input for different NLG tasks. For example, source document for summarization
+              and dialogue history for dialogue response generation
+    ref_list: human-annotated groundtruth
    """
    json_data = []
    for i in range(len(output_list)):
        cur = {}
-        cur['system_output'] = output_list[i]
+        cur["system_output"] = output_list[i]
        if src_list is not None:
-            cur['source'] = src_list[i]
+            cur["source"] = src_list[i]
        if ref_list is not None:
-            cur['reference'] = ref_list[i]
-        cur['context'] = ""
+            cur["reference"] = ref_list[i]
+        cur["context"] = ""
        json_data.append(cur)
    return json_data


 def calculate_average_score(scores):
    """
-        Calculate average scores for different metrics
+    Calculate average scores for different metrics

-        scores: a list of scores for different metrics for each answer
+    scores: a list of scores for different metrics for each answer

    """
    metrics = {metric: 0 for metric in scores[0]}
@@ -226,9 +263,9 @@ def analyze_unieval_results(results_path: str, save_path: str) -> None:
    frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))

    for metric in tqdm.tqdm(
-            frame_per_metric.keys(),
-            desc=f"UniEval metrics: ",
-            total=len(frame_per_metric.keys()),
+        frame_per_metric.keys(),
+        desc=f"UniEval metrics: ",
+        total=len(frame_per_metric.keys()),
    ):
        data = pd.DataFrame(frame_per_metric[metric])


--- a/applications/Chat/evaluate/utils.py
+++ b/applications/Chat/evaluate/utils.py
 import io
 import json
 import os
-import re
 import string
 from typing import Dict

@@ -55,7 +54,7 @@ def jload(f, mode="r"):


 def get_json_list(file_path):
-    with open(file_path, 'r') as f:
+    with open(file_path, "r") as f:
        json_list = []
        for line in f:
            json_list.append(json.loads(line))
@@ -187,9 +186,9 @@ def analyze_automatic_results(results_path: str, save_path: str) -> None:
    frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))

    for metric in tqdm.tqdm(
-            frame_per_metric.keys(),
-            desc=f"automatic metrics: ",
-            total=len(frame_per_metric.keys()),
+        frame_per_metric.keys(),
+        desc=f"automatic metrics: ",
+        total=len(frame_per_metric.keys()),
    ):
        data = pd.DataFrame(frame_per_metric[metric])


--- a/applications/Chat/examples/community/peft/easy_dataset.py
+++ b/applications/Chat/examples/community/peft/easy_dataset.py
@@ -3,7 +3,6 @@ import json
 from typing import Dict, Sequence

 import torch
-from datasets import load_dataset
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import AutoTokenizer
@@ -20,7 +19,8 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer, max_length: i
            padding="longest",
            max_length=max_length,
            truncation=True,
-        ) for text in strings
+        )
+        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
@@ -48,18 +48,17 @@ def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTo


 class EasySupervisedDataset(Dataset):
-
    def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
        super(EasySupervisedDataset, self).__init__()
        with open(data_file, "r", encoding="UTF-8") as f:
            all_lines = f.readlines()
-        #split to source and target ,source the characters before "回答：" including "回答：", target the characters after "回答："
+        # split each line into source and target: source is the text up to and including "回答：", target is the text after "回答："
        sources, targets = [], []
        for line in all_lines:
            if "回答：" in line:
                sep_index = line.index("回答：")
-                sources.append(line[:sep_index + 3])
-                targets.append(line[sep_index + 3:] + tokenizer.eos_token)
+                sources.append(line[: sep_index + 3])
+                targets.append(line[sep_index + 3 :] + tokenizer.eos_token)
            else:
                sources.append(line)
                targets.append("" + tokenizer.eos_token)
@@ -83,15 +82,17 @@ class EasySupervisedDataset(Dataset):


 class EasyPromptsDataset(Dataset):
-
    def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
        super(EasyPromptsDataset, self).__init__()
        with open(data_file, "r", encoding="UTF-8") as f:
            all_lines = f.readlines()
-            all_lines = [line if "回答：" not in line else line[:line.index("回答：") + 3] for line in all_lines]
+            all_lines = [line if "回答：" not in line else line[: line.index("回答：") + 3] for line in all_lines]
        self.prompts = [
-            tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
-                      truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
+            tokenizer(line, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True)[
+                "input_ids"
+            ]
+            .to(torch.cuda.current_device())
+            .squeeze(0)
            for line in tqdm(all_lines)
        ]
        self.data_file = data_file
@@ -110,7 +111,6 @@ class EasyPromptsDataset(Dataset):


 class EasyRewardDataset(Dataset):
-
    def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
        super(EasyRewardDataset, self).__init__()
        self.chosen = []
@@ -120,44 +120,42 @@ class EasyRewardDataset(Dataset):
        else:
            self.end_token = special_token
        print(self.end_token)
-        #read all lines in the train_file to a list
+        # read all lines in the train_file to a list
        with open(train_file, "r", encoding="UTF-8") as f:
            all_lines = f.readlines()
        for line in tqdm(all_lines):
            data = json.loads(line)
-            prompt = "提问：" + data['prompt'] + " 回答："
-
-            chosen = prompt + data['chosen'] + self.end_token
-            chosen_token = tokenizer(chosen,
-                                     max_length=max_length,
-                                     padding="max_length",
-                                     truncation=True,
-                                     return_tensors="pt")
-            self.chosen.append({
-                "input_ids": chosen_token['input_ids'],
-                "attention_mask": chosen_token['attention_mask']
-            })
-
-            reject = prompt + data['rejected'] + self.end_token
-            reject_token = tokenizer(reject,
-                                     max_length=max_length,
-                                     padding="max_length",
-                                     truncation=True,
-                                     return_tensors="pt")
-            self.reject.append({
-                "input_ids": reject_token['input_ids'],
-                "attention_mask": reject_token['attention_mask']
-            })
+            prompt = "提问：" + data["prompt"] + " 回答："
+
+            chosen = prompt + data["chosen"] + self.end_token
+            chosen_token = tokenizer(
+                chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+            )
+            self.chosen.append(
+                {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
+            )
+
+            reject = prompt + data["rejected"] + self.end_token
+            reject_token = tokenizer(
+                reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+            )
+            self.reject.append(
+                {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
+            )

    def __len__(self):
        length = len(self.chosen)
        return length

    def __getitem__(self, idx):
-        return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
-            "input_ids"], self.reject[idx]["attention_mask"]
-
-    #python representation of the object and the string representation of the object
+        return (
+            self.chosen[idx]["input_ids"],
+            self.chosen[idx]["attention_mask"],
+            self.reject[idx]["input_ids"],
+            self.reject[idx]["attention_mask"],
+        )
+
+    # python representation of the object and the string representation of the object
    def __repr__(self):
        return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"

@@ -165,26 +163,25 @@ class EasyRewardDataset(Dataset):
        return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"


-'''
+"""
 Easy SFT just accept a text file which can be read line by line. However the datasets will group texts together to max_length so LLM will learn the texts meaning better.
 If individual lines are not related, just set is_group_texts to False.
-'''
+"""


 class EasySFTDataset(Dataset):
-
    def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:
        super().__init__()
-        #read the data_file line by line
+        # read the data_file line by line
        with open(data_file, "r", encoding="UTF-8") as f:
-            #encode the text data line by line and put raw python list input_ids only to raw_input_ids list
+            # encode the text line by line, collecting each raw Python list of input_ids in raw_input_ids
            raw_input_ids = []
            for line in f:
                encoded_ids = tokenizer.encode(line)
-                #if the encoded_ids is longer than max_length, then split it into several parts
+                # if the encoded_ids is longer than max_length, then split it into several parts
                if len(encoded_ids) > max_length:
                    for i in range(0, len(encoded_ids), max_length):
-                        raw_input_ids.append(encoded_ids[i:i + max_length])
+                        raw_input_ids.append(encoded_ids[i : i + max_length])
                else:
                    raw_input_ids.append(encoded_ids)

@@ -196,12 +193,13 @@ class EasySFTDataset(Dataset):
        if is_group_texts:
            for input_ids in raw_input_ids:
                if len(current_input_ids) + len(input_ids) > max_length:
-                    #pad the current_input_ids to max_length with tokenizer.pad_token_id
+                    # pad the current_input_ids to max_length with tokenizer.pad_token_id
                    padded_length = max_length - len(current_input_ids)
                    current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                    grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
                    attention_mask.append(
-                        torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
+                        torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
+                    )
                    current_input_ids = []
                else:
                    current_input_ids.extend(input_ids)
@@ -210,14 +208,16 @@ class EasySFTDataset(Dataset):
                current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
                attention_mask.append(
-                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
+                )
        else:
-            #just append the raw_input_ids to max_length
+            # just append the raw_input_ids to max_length
            for input_ids in raw_input_ids:
                padded_length = max_length - len(input_ids)
                input_ids.extend([tokenizer.pad_token_id] * padded_length)
                attention_mask.append(
-                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
+                )
                grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
        self.input_ids = grouped_input_ids
        self.labels = copy.deepcopy(self.input_ids)
@@ -227,14 +227,14 @@ class EasySFTDataset(Dataset):
    def __len__(self):
        return len(self.input_ids)

-    #get item from dataset
+    # get item from dataset
    def __getitem__(self, idx):
        return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])

-    #generate the dataset description to be printed by print in python
+    # generate the dataset description to be printed by print in python
    def __repr__(self):
        return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"

-    #generate the dataset description to be printed by print in python
+    # generate the dataset description to be printed by print in python
    def __str__(self):
        return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
--- a/applications/Chat/examples/community/peft/easy_models.py
+++ b/applications/Chat/examples/community/peft/easy_models.py
@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from coati.models.generation import generate
-from coati.models.utils import log_probs_from_logits, masked_mean
+from coati.models.utils import log_probs_from_logits
 from peft import PeftModel
 from torch.nn.modules import Module
 from transformers import BloomConfig, BloomForCausalLM
@@ -24,38 +24,33 @@ class Actor(Module):

    @torch.no_grad()
    def generate(
-        self,
-        input_ids: torch.Tensor,
-        return_action_mask: bool = True,
-        **kwargs
+        self, input_ids: torch.Tensor, return_action_mask: bool = True, **kwargs
    ) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
        sequences = generate(self.model, input_ids, **kwargs)
        attention_mask = None
-        pad_token_id = kwargs.get('pad_token_id', None)
+        pad_token_id = kwargs.get("pad_token_id", None)
        if pad_token_id is not None:
            attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
        if not return_action_mask:
            return sequences, attention_mask, None
        input_len = input_ids.size(1)
-        eos_token_id = kwargs.get('eos_token_id', None)
+        eos_token_id = kwargs.get("eos_token_id", None)
        if eos_token_id is None:
            action_mask = torch.ones_like(sequences, dtype=torch.bool)
        else:
            # left padding may be applied, only mask action
            action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
-            action_mask = F.pad(action_mask, (1 + input_len, -1), value=True)    # include eos token and input
+            action_mask = F.pad(action_mask, (1 + input_len, -1), value=True)  # include eos token and input
        action_mask[:, :input_len] = False
        action_mask = action_mask[:, 1:]
-        return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len):]
+        return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len) :]

-    def forward(self,
-                sequences: torch.LongTensor,
-                num_actions: int,
-                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Returns action log probs
-        """
+    def forward(
+        self, sequences: torch.LongTensor, num_actions: int, attention_mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Returns action log probs"""
        output = self.model(sequences, attention_mask=attention_mask)
-        logits = output['logits']
+        logits = output["logits"]
        log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
        return log_probs[:, -num_actions:]

@@ -75,11 +70,13 @@ class BLOOMActor(Actor):
        lora_train_bias (str): LoRA bias training mode.
    """

-    def __init__(self,
-                 pretrained: str = None,
-                 config: Optional[BloomConfig] = None,
-                 checkpoint: bool = False,
-                 lora_path: str = None) -> None:
+    def __init__(
+        self,
+        pretrained: str = None,
+        config: Optional[BloomConfig] = None,
+        checkpoint: bool = False,
+        lora_path: str = None,
+    ) -> None:
        if pretrained is not None:
            model = BloomForCausalLM.from_pretrained(pretrained)
        elif config is not None:

--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
 import argparse

-import pandas as pd
 import torch
 import torch.distributed as dist
-from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
+from coati.dataset import DataCollatorForSupervisedDataset
 from coati.models.bloom import BLOOMRM, BLOOMCritic
-from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
-from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.models.gpt import GPTRM, GPTCritic
+from coati.models.llama import LlamaCritic, LlamaRM
+from coati.models.opt import OPTRM, OPTCritic
 from coati.trainer import PPOTrainer
 from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
 from easy_models import BLOOMActor
-from peft import PeftModel
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
@@ -23,24 +21,24 @@ from colossalai.nn.optimizer import HybridAdam

 def main(args):
    # configure strategy
-    if args.strategy == 'ddp':
+    if args.strategy == "ddp":
        strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="cpu", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
    else:
        raise ValueError(f'Unsupported strategy "{args.strategy}"')

    if args.rm_path is not None:
-        state_dict = torch.load(args.rm_path, map_location='cpu')
+        state_dict = torch.load(args.rm_path, map_location="cpu")

    # configure model
-    if args.model == 'bloom':
+    if args.model == "bloom":
        # initial_model = BLOOMActor(pretrained=args.pretrain)
-        print('Using peft lora to load Bloom model as initial_model')
+        print("Using peft lora to load Bloom model as initial_model")
        initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
-        print('Using peft lora to load Bloom model as initial_model (Done)')
+        print("Using peft lora to load Bloom model as initial_model (Done)")
    else:
        raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -49,59 +47,59 @@ def main(args):
    else:
        rm_model_name = args.rm_model

-    if rm_model_name == 'gpt2':
+    if rm_model_name == "gpt2":
        reward_model = GPTRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'bloom':
+    elif rm_model_name == "bloom":
        print("load bloom reward model ", args.rm_pretrain)
        reward_model = BLOOMRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'opt':
+    elif rm_model_name == "opt":
        reward_model = OPTRM(pretrained=args.rm_pretrain)
-    elif rm_model_name == 'llama':
+    elif rm_model_name == "llama":
        reward_model = LlamaRM(pretrained=args.rm_pretrain)
    else:
        raise ValueError(f'Unsupported reward model "{rm_model_name}"')

    if args.rm_path is not None:
-        print('Loading reward model from', args.rm_path)
+        print("Loading reward model from", args.rm_path)
        reward_model.load_state_dict(state_dict)

-    if args.strategy != 'colossalai_gemini':
+    if args.strategy != "colossalai_gemini":
        initial_model.to(torch.float16).to(torch.cuda.current_device())
        reward_model.to(torch.float16).to(torch.cuda.current_device())

    with strategy.model_init_context():
-        if args.model == 'bloom':
+        if args.model == "bloom":
            # actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
-            print('Using peft lora to load Bloom model as Actor')
+            print("Using peft lora to load Bloom model as Actor")
            actor = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
-            print('Using peft lora to load Bloom model as Actor (Done)')
+            print("Using peft lora to load Bloom model as Actor (Done)")
        else:
            raise ValueError(f'Unsupported actor model "{args.model}"')

-        if rm_model_name == 'gpt2':
+        if rm_model_name == "gpt2":
            critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
-        elif rm_model_name == 'bloom':
+        elif rm_model_name == "bloom":
            print("load bloom critic ", args.rm_pretrain, " lora_rank ", args.lora_rank, " use_action_mask ", True)
            critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
            print("load bloom critic (Done) ")
-        elif rm_model_name == 'opt':
+        elif rm_model_name == "opt":
            critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
-        elif rm_model_name == 'llama':
+        elif rm_model_name == "llama":
            critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
        else:
            raise ValueError(f'Unsupported reward model "{rm_model_name}"')

        if args.rm_path is not None:
-            print('Loading reward model from', args.rm_path)
+            print("Loading reward model from", args.rm_path)
            critic.load_state_dict(state_dict)
            del state_dict

-    if args.strategy != 'colossalai_gemini':
+    if args.strategy != "colossalai_gemini":
        critic.to(torch.float16).to(torch.cuda.current_device())
        actor.to(torch.float16).to(torch.cuda.current_device())

    # configure optimizer
-    if args.strategy.startswith('colossalai'):
+    if args.strategy.startswith("colossalai"):
        actor_optim = HybridAdam(actor.parameters(), lr=1e-7)
        critic_optim = HybridAdam(critic.parameters(), lr=1e-7)
    else:
@@ -109,18 +107,18 @@ def main(args):
        critic_optim = Adam(critic.parameters(), lr=1e-7)

    # configure tokenizer
-    if args.model == 'gpt2':
+    if args.model == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(args.rm_pretrain)
        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'bloom':
+    elif args.model == "bloom":
        tokenizer = BloomTokenizerFast.from_pretrained(args.rm_pretrain)
        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'opt':
+    elif args.model == "opt":
        tokenizer = AutoTokenizer.from_pretrained(args.rm_pretrain)
        tokenizer.pad_token = tokenizer.eos_token
-    elif args.model == 'llama':
+    elif args.model == "llama":
        tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
-        tokenizer.eos_token = '<\s>'
+        tokenizer.eos_token = "<\s>"
        tokenizer.pad_token = tokenizer.unk_token
    else:
        raise ValueError(f'Unsupported model "{args.model}"')
@@ -132,26 +130,27 @@ def main(args):
        prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
    else:
        prompt_sampler = None
-    prompt_dataloader = DataLoader(prompt_dataset,
-                                   shuffle=(prompt_sampler is None),
-                                   sampler=prompt_sampler,
-                                   batch_size=args.train_batch_size)
+    prompt_dataloader = DataLoader(
+        prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, batch_size=args.train_batch_size
+    )

    pretrain_dataset = EasySupervisedDataset(args.pretrain_dataset, tokenizer)
    if dist.is_initialized() and dist.get_world_size() > 1:
        pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
    else:
        pretrain_sampler = None
-    pretrain_dataloader = DataLoader(pretrain_dataset,
-                                     shuffle=(pretrain_sampler is None),
-                                     sampler=pretrain_sampler,
-                                     batch_size=args.ptx_batch_size,
-                                     collate_fn=data_collator)
+    pretrain_dataloader = DataLoader(
+        pretrain_dataset,
+        shuffle=(pretrain_sampler is None),
+        sampler=pretrain_sampler,
+        batch_size=args.ptx_batch_size,
+        collate_fn=data_collator,
+    )

    def tokenize_fn(texts):
        # MUST pad to max length to ensure the inputs on all ranks have the same length.
        # Different lengths may lead to a hang when using Gemini, as ranks would run different numbers of generation steps.
-        batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
+        batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
        return {k: v.to(torch.cuda.current_device()) for k, v in batch.items()}

    (actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
@@ -178,45 +177,46 @@ def main(args):
        eos_token_id=tokenizer.eos_token_id,
    )

-    trainer.fit(prompt_dataloader=prompt_dataloader,
-                pretrain_dataloader=pretrain_dataloader,
-                num_episodes=args.num_episodes,
-                num_update_steps=args.num_update_steps,
-                num_collect_steps=args.num_collect_steps)
+    trainer.fit(
+        prompt_dataloader=prompt_dataloader,
+        pretrain_dataloader=pretrain_dataloader,
+        num_episodes=args.num_episodes,
+        num_update_steps=args.num_update_steps,
+        num_collect_steps=args.num_collect_steps,
+    )

    # save model checkpoint after fitting
    trainer.save_model(args.save_path, only_rank0=True, tokenizer=tokenizer)
    # save optimizer checkpoint on all ranks
    if args.need_optim_ckpt:
-        strategy.save_optimizer(actor_optim,
-                                'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
-                                only_rank0=False)
+        strategy.save_optimizer(
+            actor_optim, "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()), only_rank0=False
+        )


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--prompt_path', type=str, default=None, help='path to the prompt dataset')
-    parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset')
-    parser.add_argument('--strategy',
-                        choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
-                        default='ddp',
-                        help='strategy to use')
-    parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
-    parser.add_argument('--pretrain', type=str, default=None)
-    parser.add_argument('--sft_lora_path', type=str, default=None)
-    parser.add_argument('--rm_model', default=None, choices=['gpt2', 'bloom', 'opt', 'llama'])
-    parser.add_argument('--rm_path', type=str, default=None)
-    parser.add_argument('--rm_pretrain', type=str, default=None)
-    parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts')
-    parser.add_argument('--need_optim_ckpt', type=bool, default=False)
-    parser.add_argument('--num_episodes', type=int, default=10)
-    parser.add_argument('--num_collect_steps', type=int, default=10)
-    parser.add_argument('--num_update_steps', type=int, default=5)
-    parser.add_argument('--train_batch_size', type=int, default=2)
-    parser.add_argument('--ptx_batch_size', type=int, default=1)
-    parser.add_argument('--experience_batch_size', type=int, default=8)
-    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument('--kl_coef', type=float, default=0.1)
-    parser.add_argument('--ptx_coef', type=float, default=0.9)
+    parser.add_argument("--prompt_path", type=str, default=None, help="path to the prompt dataset")
+    parser.add_argument("--pretrain_dataset", type=str, default=None, help="path to the pretrained dataset")
+    parser.add_argument(
+        "--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="ddp", help="strategy to use"
+    )
+    parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--sft_lora_path", type=str, default=None)
+    parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
+    parser.add_argument("--rm_path", type=str, default=None)
+    parser.add_argument("--rm_pretrain", type=str, default=None)
+    parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
+    parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+    parser.add_argument("--num_episodes", type=int, default=10)
+    parser.add_argument("--num_collect_steps", type=int, default=10)
+    parser.add_argument("--num_update_steps", type=int, default=5)
+    parser.add_argument("--train_batch_size", type=int, default=2)
+    parser.add_argument("--ptx_batch_size", type=int, default=1)
+    parser.add_argument("--experience_batch_size", type=int, default=8)
+    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+    parser.add_argument("--kl_coef", type=float, default=0.1)
+    parser.add_argument("--ptx_coef", type=float, default=0.9)
    args = parser.parse_args()
    main(args)