Unverified commit 079bf3cb, authored by Hongxin Liu and committed by GitHub
Browse files

[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
parent 3c6b831c
......@@ -39,8 +39,9 @@ class SFTTrainer(SLTrainer):
accumulation_steps: int = 8,
) -> None:
if accumulation_steps > 1:
assert not isinstance(strategy, GeminiStrategy), \
"Accumulation steps are not supported in stage 3 of ColossalAI"
assert not isinstance(
strategy, GeminiStrategy
), "Accumulation steps are not supported in stage 3 of ColossalAI"
super().__init__(strategy, max_epochs, model, optim)
......@@ -50,15 +51,11 @@ class SFTTrainer(SLTrainer):
def _train(self, epoch: int):
self.model.train()
for batch_id, batch in enumerate(self.train_dataloader):
batch = to_device(batch, torch.cuda.current_device())
if "attention_mask" in batch:
outputs = self.model(batch["input_ids"],
attention_mask=batch["attention_mask"],
labels=batch["labels"])
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
else:
outputs = self.model(batch["input_ids"],
labels=batch["labels"])
outputs = self.model(batch["input_ids"], labels=batch["labels"])
loss = outputs.loss
loss = loss / self.accumulation_steps
......@@ -73,12 +70,14 @@ class SFTTrainer(SLTrainer):
self.optimizer.zero_grad()
self.scheduler.step()
if is_rank_0() and self.use_wandb:
wandb.log({
"loss": self.total_loss / self.accumulation_steps,
"lr": self.scheduler.get_last_lr()[0],
"epoch": epoch,
"batch_id": batch_id
})
wandb.log(
{
"loss": self.total_loss / self.accumulation_steps,
"lr": self.scheduler.get_last_lr()[0],
"epoch": epoch,
"batch_id": batch_id,
}
)
self.total_loss = 0
self.step_bar.update()
......@@ -89,9 +88,9 @@ class SFTTrainer(SLTrainer):
loss_sum, num_seen = 0, 0
for batch in self.eval_dataloader:
batch = to_device(batch, torch.cuda.current_device())
outputs = self.model(batch["input_ids"],
attention_mask=batch["attention_mask"],
labels=batch["labels"])
outputs = self.model(
batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]
)
loss = outputs.loss
loss_sum += loss.item()
......@@ -99,13 +98,15 @@ class SFTTrainer(SLTrainer):
loss_mean = loss_sum / num_seen
if dist.get_rank() == 0:
self.logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}')
self.logger.info(f"Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}")
def _before_fit(self,
train_dataloader: DataLoader,
eval_dataloader: Optional[DataLoader] = None,
logger: Optional[DistributedLogger] = None,
use_wandb: bool = False):
def _before_fit(
self,
train_dataloader: DataLoader,
eval_dataloader: Optional[DataLoader] = None,
logger: Optional[DistributedLogger] = None,
use_wandb: bool = False,
):
"""
Args:
train_dataloader: the dataloader to use for training
......@@ -124,6 +125,6 @@ class SFTTrainer(SLTrainer):
self.no_epoch_bar = True
self.step_bar = tqdm.trange(
len(self.train_dataloader) // self.accumulation_steps * self.max_epochs,
desc=f'steps',
disable=not is_rank_0()
desc=f"steps",
disable=not is_rank_0(),
)
......@@ -2,7 +2,4 @@ from .base import Strategy
from .colossalai import GeminiStrategy, LowLevelZeroStrategy
from .ddp import DDPStrategy
__all__ = [
'Strategy', 'DDPStrategy',
'LowLevelZeroStrategy', 'GeminiStrategy'
]
__all__ = ["Strategy", "DDPStrategy", "LowLevelZeroStrategy", "GeminiStrategy"]
......@@ -19,7 +19,7 @@ _BoostArgSpec = Union[nn.Module, Tuple[nn.Module, Optimizer], Dict]
class Strategy(ABC):
"""
Base class for training strategies.
Base class for training strategies.
"""
def __init__(self, plugin_initializer: Callable[..., Optional[Plugin]] = lambda: None) -> None:
......@@ -83,16 +83,18 @@ class Strategy(ABC):
rets.append((model, optimizer))
elif isinstance(arg, Dict):
model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
boost_result = dict(model=model,
optimizer=optimizer,
criterion=criterion,
dataloader=dataloader,
lr_scheduler=lr_scheduler)
boost_result = dict(
model=model,
optimizer=optimizer,
criterion=criterion,
dataloader=dataloader,
lr_scheduler=lr_scheduler,
)
# remove None values
boost_result = {key: value for key, value in boost_result.items() if value is not None}
rets.append(boost_result)
else:
raise RuntimeError(f'Type {type(arg)} is not supported')
raise RuntimeError(f"Type {type(arg)} is not supported")
return rets[0] if len(rets) == 1 else rets
......@@ -125,11 +127,9 @@ class Strategy(ABC):
return DistributedSampler(dataset, 1, 0)
@abstractmethod
def save_pretrained(self,
model: nn.Module,
path: str,
only_rank0: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
def save_pretrained(
self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
) -> None:
pass
@abstractmethod
......
......@@ -42,27 +42,27 @@ class LowLevelZeroStrategy(DDPStrategy):
"""
def __init__(self,
stage: int = 2,
precision: str = 'fp16',
seed: int = 42,
placement_policy: str = 'cuda',
reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2
overlap_communication: bool = True, # only for stage 1&2
initial_scale: float = 2**16,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0
) -> None:
def __init__(
self,
stage: int = 2,
precision: str = "fp16",
seed: int = 42,
placement_policy: str = "cuda",
reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2
overlap_communication: bool = True, # only for stage 1&2
initial_scale: float = 2**16,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0,
) -> None:
assert stage in (1, 2), f'Unsupported stage "{stage}"'
assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
plugin_initializer = lambda: LowLevelZeroPlugin(
# zero_config
......@@ -71,7 +71,7 @@ class LowLevelZeroStrategy(DDPStrategy):
# zero_optim_config
reduce_bucket_size_in_m=reduce_bucket_size,
overlap_communication=overlap_communication,
cpu_offload=(placement_policy == 'cpu'),
cpu_offload=(placement_policy == "cpu"),
# optim_config
initial_scale=initial_scale,
growth_factor=growth_factor,
......@@ -81,14 +81,15 @@ class LowLevelZeroStrategy(DDPStrategy):
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type
norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
assert isinstance(self.plugin, LowLevelZeroPlugin), \
f'{type(self).__name__}\'s plugin is not initialized properly.'
assert isinstance(
self.plugin, LowLevelZeroPlugin
), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
......@@ -131,45 +132,45 @@ class GeminiStrategy(DDPStrategy):
"""
def __init__(self,
seed: int = 42,
shard_init: bool = False, # only for stage 3
placement_policy: str = 'cuda',
pin_memory: bool = True, # only for stage 3
force_outputs_fp32: bool = False, # only for stage 3
search_range_m: int = 32, # only for stage 3
hidden_dim: Optional[int] = None, # only for stage 3
min_chunk_size_m: float = 32, # only for stage 3
gpu_margin_mem_ratio: float = 0.0, # only for stage 3
initial_scale: float = 2**16,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0
) -> None:
assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
def __init__(
self,
seed: int = 42,
shard_init: bool = False, # only for stage 3
placement_policy: str = "cuda",
pin_memory: bool = True, # only for stage 3
force_outputs_fp32: bool = False, # only for stage 3
search_range_m: int = 32, # only for stage 3
hidden_dim: Optional[int] = None, # only for stage 3
min_chunk_size_m: float = 32, # only for stage 3
gpu_margin_mem_ratio: float = 0.0, # only for stage 3
initial_scale: float = 2**16,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0,
) -> None:
assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
# TODO(ver217): support shard_init when using from_pretrained()
if shard_init:
warnings.warn(
f'Shard init is not supported model.from_pretrained() yet. '
'Please load weights after strategy.prepare()'
f"Shard init is not supported model.from_pretrained() yet. "
"Please load weights after strategy.prepare()"
)
self.shard_init = shard_init
warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")
# NOTE: dist should be initialized before calling get_current_device()
plugin_initializer = lambda: GeminiPlugin(
# gemini_config
device=get_current_device(),
placement_policy=placement_policy,
precision='fp16',
precision="fp16",
pin_memory=pin_memory,
force_outputs_fp32=force_outputs_fp32,
strict_ddp_mode=shard_init,
......@@ -187,14 +188,13 @@ class GeminiStrategy(DDPStrategy):
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type
norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
assert isinstance(self.plugin, GeminiPlugin), \
f'{type(self).__name__}\'s plugin is not initialized properly.'
assert isinstance(self.plugin, GeminiPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
......@@ -203,10 +203,9 @@ class GeminiStrategy(DDPStrategy):
world_size = dist.get_world_size()
shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
return ColoInitContext(device=get_current_device(),
dtype=torch.half,
default_pg=shard_pg,
default_dist_spec=default_dist_spec)
return ColoInitContext(
device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
)
def unwrap_model(self, model: nn.Module) -> nn.Module:
assert isinstance(model, GeminiModel)
......
......@@ -31,24 +31,21 @@ def get_grad_required_state_dict(model: nn.Module):
class DDPStrategy(Strategy):
"""
Strategy for distributed training using torch.distributed.
Strategy for distributed training using torch.distributed.
"""
def __init__(self,
seed: int = 42,
plugin_initializer: Callable = TorchDDPPlugin
) -> None:
def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
self.seed = seed
super().__init__(plugin_initializer)
def _try_init_dist(self, force: bool = False) -> None:
try:
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
host = os.environ['MASTER_ADDR']
port = int(os.environ['MASTER_PORT'])
dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
host = os.environ["MASTER_ADDR"]
port = int(os.environ["MASTER_PORT"])
dist.init_process_group("nccl", init_method=f"tcp://[{host}]:{port}", world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
except KeyError as e:
if force:
......@@ -60,8 +57,7 @@ class DDPStrategy(Strategy):
raise e
def _post_init(self) -> None:
assert isinstance(self.plugin, TorchDDPPlugin), \
f'{type(self).__name__}\'s plugin is not initialized properly.'
assert isinstance(self.plugin, TorchDDPPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
self._try_init_dist(force=True)
......@@ -73,12 +69,14 @@ class DDPStrategy(Strategy):
torch.manual_seed(seed)
def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
return self.plugin.prepare_dataloader(data_buffer,
batch_size=data_buffer.sample_batch_size,
shuffle=True,
drop_last=True,
pin_memory=pin_memory,
collate_fn=data_buffer.collate_fn)
return self.plugin.prepare_dataloader(
data_buffer,
batch_size=data_buffer.sample_batch_size,
shuffle=True,
drop_last=True,
pin_memory=pin_memory,
collate_fn=data_buffer.collate_fn,
)
def setup_sampler(self, dataset) -> DistributedSampler:
# FIXME(cwher): this is only invoked in train_on_ray and has not been tested since adapting to the Boost API.
......@@ -88,11 +86,9 @@ class DDPStrategy(Strategy):
assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
return model.unwrap()
def save_pretrained(self,
model: nn.Module,
path: str,
only_rank0: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
def save_pretrained(
self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
) -> None:
if not only_rank0 or dist.get_rank() == 0:
unwrapped_model = self.unwrap_model(model)
assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
......@@ -103,17 +99,11 @@ class DDPStrategy(Strategy):
if tokenizer is not None:
tokenizer.save_pretrained(path)
model_path = os.path.join(path, "pytorch_model.bin")
self.save_model(model,
model_path,
only_rank0=only_rank0)
self.save_model(model, model_path, only_rank0=only_rank0)
def _replace_keys(model_path: str,
replace_fn: Callable):
def _replace_keys(model_path: str, replace_fn: Callable):
state_dict = torch.load(model_path, map_location="cpu")
state_dict = {
replace_fn(k): v
for k, v in state_dict.items()
}
state_dict = {replace_fn(k): v for k, v in state_dict.items()}
torch.save(state_dict, model_path)
# FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
......@@ -124,13 +114,13 @@ class DDPStrategy(Strategy):
def get_model_state_dict_shard(self, model: nn.Module, **config):
# TODO: implement sharding on naive strategy
model = self.unwrap_model(model)
if 'requires_grad_only' in config and config['requires_grad_only'] == True:
if "requires_grad_only" in config and config["requires_grad_only"] == True:
state_dict = get_grad_required_state_dict(model)
else:
state_dict = model.state_dict()
if 'shard_size' in config:
shard_size = config['shard_size']
if "shard_size" in config:
shard_size = config["shard_size"]
accumulate_size = 0
state_dict_shard = OrderedDict()
for name, param in state_dict.items():
......
......@@ -4,7 +4,6 @@ import numpy as np
class DistributedSampler:
def __init__(self, dataset, num_replicas: int, rank: int) -> None:
self.dataset = dataset
self.num_replicas = num_replicas
......@@ -12,7 +11,7 @@ class DistributedSampler:
if len(self.dataset) % self.num_replicas != 0:
self.num_samples = math.ceil(
(len(self.dataset) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
(len(self.dataset) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
)
else:
self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
......@@ -20,10 +19,10 @@ class DistributedSampler:
self.total_size = self.num_samples * self.num_replicas
indices = list(range(len(self.dataset)))
indices = indices[:self.total_size]
indices = indices[: self.total_size]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
self.indices = indices
......
......@@ -42,7 +42,6 @@ def is_rank_0() -> bool:
def to_device(x: Any, device: torch.device) -> Any:
def _to(t: Any):
if isinstance(t, torch.Tensor):
return t.to(device)
......
......@@ -70,7 +70,7 @@
"BLEU",
"ROUGE",
"BERTScore"
]
]
},
"logical_reasoning": {
"GPT": [
......@@ -83,7 +83,7 @@
"ROUGE",
"BERTScore",
"CHRF"
]
]
},
"open_qa": {
"GPT": [
......@@ -126,7 +126,7 @@
"conciseness"
],
"Metrics": [
]
]
},
"Finance": {
"GPT": [
......@@ -134,7 +134,7 @@
"correctness"
],
"Metrics": [
]
]
},
"Law": {
"GPT": [
......@@ -142,7 +142,7 @@
"correctness"
],
"Metrics": [
]
]
},
"Education": {
"GPT": [
......@@ -150,7 +150,7 @@
"correctness"
],
"Metrics": [
]
]
},
"Medical": {
"GPT": [
......@@ -158,7 +158,7 @@
"correctness"
],
"Metrics": [
]
]
},
"STEM": {
"GPT": [
......@@ -166,7 +166,7 @@
"correctness"
],
"Metrics": [
]
]
},
"SocialScience": {
"GPT": [
......@@ -174,7 +174,7 @@
"correctness"
],
"Metrics": [
]
]
},
"Humanity": {
"GPT": [
......@@ -182,7 +182,7 @@
"correctness"
],
"Metrics": [
]
]
},
"Other": {
"GPT": [
......@@ -190,7 +190,7 @@
"correctness"
],
"Metrics": [
]
]
},
"ethics": {
"GPT": [
......@@ -198,7 +198,7 @@
"correctness"
],
"Metrics": [
]
]
}
}
}
import argparse
import json
import os
import openai
......@@ -9,7 +8,8 @@ from utils import jload
def main(args):
assert len(args.answer_file_list) == len(
args.model_name_list), "The number of answer files and model names should be equal!"
args.model_name_list
), "The number of answer files and model names should be equal!"
# load config
config = jload(args.config_file)
......@@ -36,7 +36,8 @@ def main(args):
if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
raise Exception(
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
)
if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
raise Exception(
......@@ -44,8 +45,15 @@ def main(args):
)
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"], config.get("path_for_UniEval", None), args.gpt_with_reference)
evaluator = Evaluator(
metrics_per_category,
battle_prompt,
gpt_evaluation_prompt,
args.gpt_model,
config["language"],
config.get("path_for_UniEval", None),
args.gpt_with_reference,
)
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
......@@ -68,41 +76,41 @@ def main(args):
raise ValueError(f'Unsupported language {config["language"]}!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
parser.add_argument('--config_file',
type=str,
default=None,
required=True,
help='path to the file of target results')
parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
parser.add_argument('--gpt_evaluation_prompt_file',
type=str,
default=None,
help='path to the prompt file for gpt evaluation')
parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
parser.add_argument('--answer_file_list',
type=str,
nargs='+',
default=[],
required=True,
help='path to the answer files of at most 2 models')
parser.add_argument('--model_name_list',
type=str,
nargs='+',
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--gpt_model',
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--gpt_with_reference',
default=False,
action="store_true",
help='whether to include reference answer in gpt evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
parser.add_argument(
"--config_file", type=str, default=None, required=True, help="path to the file of target results"
)
parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
parser.add_argument(
"--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
)
parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
parser.add_argument(
"--answer_file_list",
type=str,
nargs="+",
default=[],
required=True,
help="path to the answer files of at most 2 models",
)
parser.add_argument(
"--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
)
parser.add_argument(
"--gpt_model",
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help="which GPT model to use for evaluation",
)
parser.add_argument(
"--gpt_with_reference",
default=False,
action="store_true",
help="whether to include reference answer in gpt evaluation",
)
parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
args = parser.parse_args()
if args.openai_key is not None:
......
......@@ -3,20 +3,27 @@ from typing import Any, Dict, List
import gpt_evaluate
import metrics
import pandas as pd
import unieval
from utils import analyze_automatic_results, get_data_per_category, save_automatic_results
class Evaluator(object):
"""
A class named Evaluator includes GPT-3.5/GPT-4 evaluation
and automatic evaluation
A class named Evaluator includes GPT-3.5/GPT-4 evaluation
and automatic evaluation
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
gpt_model: str, language: str, path_for_UniEval: Dict[str, str], gpt_with_reference: bool) -> None:
def __init__(
self,
params: Dict[str, Any],
battle_prompt: Dict[str, Any],
gpt_evaluation_prompt: Dict[str, Any],
gpt_model: str,
language: str,
path_for_UniEval: Dict[str, str],
gpt_with_reference: bool,
) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
......@@ -103,7 +110,8 @@ class Evaluator(object):
if self.params[category]["UniEval"] and self.language == "cn":
raise Exception(
"UniEval doesn't support Chinese! Please remove UniEval config in your Chinese config file.")
"UniEval doesn't support Chinese! Please remove UniEval config in your Chinese config file."
)
category_metrics = self.params[category]["UniEval"]
......@@ -134,10 +142,9 @@ class Evaluator(object):
sources_list = [answer["instruction"] + answer["input"] for answer in answers_per_category[category]]
data = unieval.convert_data_to_unieval_format(predicts_list, sources_list, targets_list)
scores = uni_evaluator.evaluate(data,
category,
dims=list(self.unieval_metric_stats[task][category].keys()),
overall=False)
scores = uni_evaluator.evaluate(
data, category, dims=list(self.unieval_metric_stats[task][category].keys()), overall=False
)
avg_scores = unieval.calculate_average_score(scores)
self.unieval_metric_stats[task][category].update(avg_scores)
......@@ -165,7 +172,8 @@ class Evaluator(object):
category,
self.gpt_model,
self.language,
references=targets_per_category[category] if self.gpt_with_reference else None)
references=targets_per_category[category] if self.gpt_with_reference else None,
)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
......@@ -204,16 +212,18 @@ class Evaluator(object):
gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0],
self.gpt_evaluation_results,
gpt_evaluation_results_save_path)
all_evaluations = gpt_evaluate.save_gpt_evaluation_results(
model_name_list[0], self.gpt_evaluation_results, gpt_evaluation_results_save_path
)
# Start to calculate scores and save statistics.
gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
gpt_evaluation_statistics_save_path)
gpt_evaluate.save_gpt_evaluation_statistics(
model_name_list[0], all_evaluations, gpt_evaluation_statistics_save_path
)
# Save charts and csv.
gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
gpt_evaluation_analyses_save_path)
gpt_evaluate.analyze_gpt_evaluation_statistics(
gpt_evaluation_statistics_save_path, gpt_evaluation_analyses_save_path
)
......@@ -14,20 +14,18 @@ import tqdm
from utils import jdump, jload
ref_step_template = {
"en":
"Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
"cn":
"请比较答案与上面的{adjective}答案,确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n"
"en": "Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
"cn": "请比较答案与上面的{adjective}答案,确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n",
}
ref_answer_template_general = {
"en": "\nAn example answer with good quality is as follows:\n\n{answer}\n\n",
"cn": "\n一个优质的示例答案如下:\n\n{answer}\n\n"
"cn": "\n一个优质的示例答案如下:\n\n{answer}\n\n",
}
ref_answer_template_correctness = {
"en": "\nA correct answer is as follows:\n\n{answer}\n\n",
"cn": "\n标准答案如下:\n\n{answer}\n\n"
"cn": "\n标准答案如下:\n\n{answer}\n\n",
}
......@@ -51,10 +49,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{
"role": "system",
"content": sys_prompt
},
{"role": "system", "content": sys_prompt},
{
"role": "user",
"content": user_prompt,
......@@ -106,7 +101,7 @@ def parse_battle_score(evaluation: str) -> List[float]:
return [float(sp[0]), float(sp[1])]
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
except Exception:
return [-1, -1]
......@@ -125,9 +120,6 @@ def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]
assert len(answer1) == len(answer2)
handles = []
evaluation_file = []
total_len = len(answer1)
question_idx_list = list(range(total_len))
......@@ -140,9 +132,12 @@ def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]
assert answer1[i]["id"] == answer2[i]["id"]
answer_id = answer1[i]["id"]
ques = answer1[i]["instruction"] if answer1[i][
"input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"]
cat = answer1[i]["category"]
ques = (
answer1[i]["instruction"]
if answer1[i]["input"] == ""
else answer1[i]["instruction"] + " " + answer1[i]["input"]
)
answer1[i]["category"]
ans1 = answer1[i]["output"]
ans2 = answer2[i]["output"]
......@@ -267,7 +262,11 @@ def reference_template(metric: str, language: str, reference: Dict[str, Any]) ->
step_to_add = ref_step_template[language]
for_the_given_answer = "{metric} (1-5) (directly give the score for the given answer):" if language == "en" else "{metric} (1-5) (直接对给定答案打分)"
for_the_given_answer = (
"{metric} (1-5) (directly give the score for the given answer):"
if language == "en"
else "{metric} (1-5) (直接对给定答案打分)"
)
# adjective is used to describe the word "answer" in the prompt.
adjective = "example" if language == "en" else "示例"
......@@ -280,8 +279,9 @@ def reference_template(metric: str, language: str, reference: Dict[str, Any]) ->
answer_to_add = ref_answer_template_correctness[language]
answer_to_add = answer_to_add.format(answer=reference["target"] if reference["target"] else reference["output"])
step_to_add = step_to_add.format(metric=metric.lower(),
adjective=adjective) + for_the_given_answer.format(metric=metric)
step_to_add = step_to_add.format(metric=metric.lower(), adjective=adjective) + for_the_given_answer.format(
metric=metric
)
return answer_to_add + step_to_add
......@@ -329,7 +329,8 @@ def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens:
for j in range(i):
messages_to_send.append(fill_in_message("user", user_messages[j]))
messages_to_send.append(
fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"]))
fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"])
)
# Length of user messages == Length of assistant messages + 1
# because we always expect the API to respond
......@@ -351,13 +352,15 @@ def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens:
return assistant_responses[-1]
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
language: str,
reference: Dict[str, Any] = None,
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_without_logprobs(
prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
language: str,
reference: Dict[str, Any] = None,
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048,
) -> Dict[str, Any]:
"""
Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
......@@ -378,7 +381,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
answer = inst["output"]
inst["evaluation"] = {}
......@@ -400,10 +403,9 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
if prompt_reference:
# Do a 2-round conversation
response = multiturn_chat_completion([prompt_1st_round, prompt_reference],
model,
max_tokens=max_tokens,
turns=2)
response = multiturn_chat_completion(
[prompt_1st_round, prompt_reference], model, max_tokens=max_tokens, turns=2
)
else:
response = multiturn_chat_completion([prompt_1st_round], model, max_tokens=max_tokens, turns=1)
......@@ -427,10 +429,9 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
return inst
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_with_logprobs(
prompt: Dict[str, Any], inst: Dict[str, Any], metrics: List[str], max_tokens: int = 2048
) -> Dict[str, Any]:
"""
Use completion model(text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
......@@ -449,7 +450,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
answer = inst["output"]
inst["evaluation"] = {}
......@@ -492,13 +493,15 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
return inst
def evaluate(answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
model: str,
language: str,
references: List[Dict] = None) -> List[Dict]:
def evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
model: str,
language: str,
references: List[Dict] = None,
) -> List[Dict]:
"""
Use GPT models to evaluate model answers and save evaluation results.
......@@ -529,21 +532,23 @@ def evaluate(answers: List[Dict],
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs,
prompt,
inst,
metrics,
language,
reference=None if references is None else references[idx],
model=model,
max_tokens=1)
future = executor.submit(
get_gpt_evaluation_without_logprobs,
prompt,
inst,
metrics,
language,
reference=None if references is None else references[idx],
model=model,
max_tokens=1,
)
futures.append(future)
for future in tqdm.tqdm(
concurrent.futures.as_completed(futures),
desc=f"{category}: ",
total=len(futures),
concurrent.futures.as_completed(futures),
desc=f"{category}: ",
total=len(futures),
):
evaluations.append(future.result())
......@@ -610,12 +615,13 @@ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) ->
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
except Exception:
return 0
def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
save_path: str) -> Dict[str, Any]:
def save_gpt_evaluation_results(
model_name: str, gpt_evaluation_results: Dict[str, Any], save_path: str
) -> Dict[str, Any]:
"""
Save evaluation results for different categories for one model.
......@@ -667,10 +673,12 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], sav
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])
)
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation)
)
statistics = {}
for metric in metrics:
......@@ -751,9 +759,9 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> N
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
desc=f"GPT evaluation: ",
total=len(frame_per_category.keys()),
frame_per_category.keys(),
desc=f"GPT evaluation: ",
total=len(frame_per_category.keys()),
):
data = pd.DataFrame(frame_per_category[category])
......
......@@ -21,13 +21,17 @@ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str,
"""
bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
cumulative_bleu = [0] * 4
weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.),
(1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
weights = [
(1.0 / 1.0, 0.0, 0.0, 0.0),
(1.0 / 2.0, 1.0 / 2.0, 0.0, 0.0),
(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0),
(1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0),
]
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()]
pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
target_list = [(" ".join(jieba.cut(preprocessing_text(target)))).split()]
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = [preprocessing_text(target).split()]
......@@ -42,15 +46,14 @@ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str,
def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate CHRF Score Metric in sentence level.
"""
"""Calculate CHRF Score Metric in sentence level."""
chrf_score = {"chrf": 0}
cumulative_chrf = []
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
target_list = ' '.join(jieba.cut(preprocessing_text(target))).split()
pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
target_list = " ".join(jieba.cut(preprocessing_text(target))).split()
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = preprocessing_text(target).split()
......@@ -75,8 +78,8 @@ def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
all_targets = []
for pred, target in zip(preds, targets):
pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred))))
target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target))))
pred_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(pred))))
target_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(target))))
all_preds.append(pred_list)
all_targets.append(target_list)
......@@ -99,16 +102,14 @@ def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
longest common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
for pred, target in zip(preds, targets):
score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
rouge_scores["rouge1"] += score['rouge1'].fmeasure
rouge_scores["rouge2"] += score['rouge2'].fmeasure
rouge_scores["rougeL"] += score['rougeL'].fmeasure
rouge_scores["rouge1"] += score["rouge1"].fmeasure
rouge_scores["rouge2"] += score["rouge2"].fmeasure
rouge_scores["rougeL"] += score["rougeL"].fmeasure
rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
......@@ -137,7 +138,7 @@ def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
for pred in preds:
if language == "cn":
pred_seg_list = ' '.join(jieba.cut(pred)).split()
pred_seg_list = " ".join(jieba.cut(pred)).split()
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
......@@ -151,7 +152,7 @@ def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
split_pred = preprocessing_text(pred).split()
for n in range(0, 3):
for i in range(0, len(split_pred) - n):
ngram = ' '.join(split_pred[i:i + n + 1])
ngram = " ".join(split_pred[i : i + n + 1])
unique_ngram[n].add(ngram)
all_ngram_count[n] += 1
......@@ -203,8 +204,8 @@ def calculate_precision_recall_f1(preds: List[str], targets: List[str], language
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()]
target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()]
pred_list = [char for char in " ".join(jieba.cut(preprocessing_text(pred))).split()]
target_list = [char for char in " ".join(jieba.cut(preprocessing_text(target))).split()]
elif language == "en":
pred_list = [char for char in preprocessing_text(pred).split()]
target_list = [char for char in preprocessing_text(target).split()]
......
......@@ -7,6 +7,9 @@ from .utils import (
)
# Names exported by a star-import of this package.
__all__ = [
    "get_evaluator",
    "convert_data_to_unieval_format",
    "calculate_average_score",
    "save_unieval_results",
    "analyze_unieval_results",
]
......@@ -28,29 +28,29 @@ from .utils import add_question
class SumEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
    """Set up evaluator for text summarization.

    model_name_or_path: checkpoint passed to UniEvaluator. An empty string
        selects the default "MingZhong/unieval-sum".
    max_length, device, cache_dir: forwarded unchanged to UniEvaluator.
    """
    self.scorer = UniEvaluator(
        model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
        max_length=max_length,
        device=device,
        cache_dir=cache_dir,
    )
    self.task = "summarization"
    # Dimensions scored when evaluate() is called without an explicit `dims`.
    self.dimensions = ["coherence", "consistency", "fluency", "relevance"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
Get the scores of all the given dimensions
category: The category to be evaluated.
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
four dimensions: coherence, consistency, fluency, relevance.
dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
four dimensions: coherence, consistency, fluency, relevance.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
......@@ -63,12 +63,12 @@ class SumEvaluator:
for dim in eval_dims:
# Calculate average sentence-level scores for 'consistency' and 'fluency'
if dim == 'consistency' or dim == 'fluency':
if dim == "consistency" or dim == "fluency":
src_list, output_list = [], []
n_sents = [] # the number of sentences in each generated summary
n_sents = [] # the number of sentences in each generated summary
for i in range(n_data):
source = data[i]['source']
system_outputs = sent_tokenize(data[i]['system_output'])
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
......@@ -81,24 +81,26 @@ class SumEvaluator:
score = []
for cur_n_sent in n_sents:
# prevent denominator from being 0
score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
start_idx += cur_n_sent
# Calculate summary-level score for 'coherence' and 'relevance'
elif dim == 'coherence' or dim == 'relevance':
elif dim == "coherence" or dim == "relevance":
src_list, output_list, ref_list = [], [], []
for i in range(n_data):
src_list.append(data[i]['source'])
output_list.append(data[i]['system_output'])
if dim == 'relevance':
ref_list.append(data[i]['reference'])
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
if dim == "relevance":
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for summarization
else:
raise NotImplementedError('The input format for this dimension is still undefined. \
Please customize it first.')
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
......@@ -106,35 +108,35 @@ class SumEvaluator:
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class DialogEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
    """Set up evaluator for dialogues.

    model_name_or_path: checkpoint passed to UniEvaluator. An empty string
        selects the default "MingZhong/unieval-dialog".
    max_length, device, cache_dir: forwarded unchanged to UniEvaluator.
    """
    self.scorer = UniEvaluator(
        model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
        max_length=max_length,
        device=device,
        cache_dir=cache_dir,
    )
    self.task = "dialogue"
    # Dimensions scored when evaluate() is called without an explicit `dims`.
    self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
Get the scores of all the given dimensions
category: The category to be evaluated.
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
......@@ -147,50 +149,48 @@ class DialogEvaluator:
for dim in eval_dims:
# Calculate summation score for 'engagingness'
if dim == 'engagingness':
if dim == "engagingness":
src_list, output_list, context_list = [], [], []
n_sents = [] # the number of sentences in each generated response
n_sents = [] # the number of sentences in each generated response
for i in range(n_data):
source = data[i]['source']
context = data[i]['context']
system_outputs = sent_tokenize(data[i]['system_output'])
source = data[i]["source"]
context = data[i]["context"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
context_list.append(context)
output_list.append(system_outputs[j])
input_list = add_question(dimension=dim,
output=output_list,
src=src_list,
context=context_list,
task=self.task)
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
sent_score = self.scorer.score(input_list, self.task, category, dim)
# Get the summation score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
start_idx += cur_n_sent
# Calculate turn-level score for other dimensions
elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
src_list, output_list, context_list = [], [], []
for i in range(n_data):
src_list.append(data[i]['source'])
output_list.append(data[i]['system_output'])
context_list.append(data[i]['context'])
input_list = add_question(dimension=dim,
output=output_list,
src=src_list,
context=context_list,
task=self.task)
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
context_list.append(data[i]["context"])
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for dialogues
else:
raise NotImplementedError('The input format for this dimension is still undefined. \
Please customize it first.')
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
......@@ -198,35 +198,35 @@ class DialogEvaluator:
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class D2tEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
    """Set up evaluator for data-to-text.

    model_name_or_path: checkpoint passed to UniEvaluator. An empty string
        selects the default "MingZhong/unieval-sum".
    max_length, device, cache_dir: forwarded unchanged to UniEvaluator.
    """
    self.scorer = UniEvaluator(
        model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
        max_length=max_length,
        device=device,
        cache_dir=cache_dir,
    )
    self.task = "data2text"
    # Dimensions scored when evaluate() is called without an explicit `dims`.
    self.dimensions = ["naturalness", "informativeness"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
Get the scores of all the given dimensions
category: The category to be evaluated.
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
two dimensions: naturalness and informativeness.
dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
two dimensions: naturalness and informativeness.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
......@@ -240,8 +240,8 @@ class D2tEvaluator:
for dim in eval_dims:
output_list, ref_list = [], []
for i in range(n_data):
output_list.append(data[i]['system_output'])
ref_list.append(data[i]['reference'])
output_list.append(data[i]["system_output"])
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
......@@ -252,38 +252,38 @@ class D2tEvaluator:
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class FactEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
    """Set up evaluator for factual consistency detection.

    model_name_or_path: checkpoint passed to UniEvaluator. An empty string
        selects the default "MingZhong/unieval-fact".
    max_length, device, cache_dir: forwarded unchanged to UniEvaluator.
    """
    self.scorer = UniEvaluator(
        model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
        max_length=max_length,
        device=device,
        cache_dir=cache_dir,
    )
    self.task = "fact"
    # This task has a single dimension, so it is stored as a scalar rather than a list.
    self.dim = "consistency"
def evaluate(self, data, category):
"""
Get the factual consistency score (only 1 dimension for this task)
Get the factual consistency score (only 1 dimension for this task)
category: The category to be evaluated.
category: The category to be evaluated.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
# Calculate average sentence-level scores for factual consistency
src_list, output_list = [], []
n_sents = [] # the number of sentences in the claim
n_sents = [] # the number of sentences in the claim
for i in range(n_data):
source = data[i]['source']
system_outputs = sent_tokenize(data[i]['system_output'])
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
......@@ -295,7 +295,7 @@ class FactEvaluator:
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / cur_n_sent)
start_idx += cur_n_sent
for i in range(n_data):
......@@ -304,28 +304,26 @@ class FactEvaluator:
return eval_scores
def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
    """Build the evaluator object for a UniEval task.

    task: one of "summarization", "dialogue", "data2text" or "fact".
    model_name_or_path, max_length, device, cache_dir: forwarded unchanged to
        the selected evaluator's constructor.

    Returns a SumEvaluator, DialogEvaluator, D2tEvaluator or FactEvaluator.

    Raises AssertionError for an unsupported task, or NotImplementedError when
    assertions are disabled.
    """
    assert task in ["summarization", "dialogue", "data2text", "fact"]
    if task == "summarization":
        evaluator_cls = SumEvaluator
    elif task == "dialogue":
        evaluator_cls = DialogEvaluator
    elif task == "data2text":
        evaluator_cls = D2tEvaluator
    elif task == "fact":
        evaluator_cls = FactEvaluator
    else:
        # Only reachable when the assert above is stripped (python -O).
        # The message is one literal: a backslash continuation inside the
        # string would embed the source indentation in the message.
        raise NotImplementedError("Other tasks are not implemented, please customize specific tasks here.")
    return evaluator_cls(
        model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
    )
......@@ -27,9 +27,8 @@ from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
class UniEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
""" Set up model """
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up model"""
self.device = device
self.max_length = max_length
......@@ -47,8 +46,8 @@ class UniEvaluator:
def score(self, inputs, task, category, dim, batch_size=8):
"""
Get scores for the given samples.
final_score = positive_score / (positive_score + negative_score)
Get scores for the given samples.
final_score = positive_score / (positive_score + negative_score)
"""
# The implementation of "forward" in T5 still requires decoder_input_ids.
......@@ -58,31 +57,27 @@ class UniEvaluator:
pos_score_list, neg_score_list = [], []
for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
src_list = inputs[i:i + batch_size]
tgt_list = tgts[i:i + batch_size]
src_list = inputs[i : i + batch_size]
tgt_list = tgts[i : i + batch_size]
try:
with torch.no_grad():
encoded_src = self.tokenizer(src_list,
max_length=self.max_length,
truncation=True,
padding=True,
return_tensors='pt')
encoded_tgt = self.tokenizer(tgt_list,
max_length=self.max_length,
truncation=True,
padding=True,
return_tensors='pt')
src_tokens = encoded_src['input_ids'].to(self.device)
src_mask = encoded_src['attention_mask'].to(self.device)
tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1)
encoded_src = self.tokenizer(
src_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
)
encoded_tgt = self.tokenizer(
tgt_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
)
src_tokens = encoded_src["input_ids"].to(self.device)
src_mask = encoded_src["attention_mask"].to(self.device)
tgt_tokens = encoded_tgt["input_ids"].to(self.device)[:, 0].unsqueeze(-1)
output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
logits = output.logits.view(-1, self.model.config.vocab_size)
pos_score = self.softmax(logits)[:, self.pos_id] # Yes
neg_score = self.softmax(logits)[:, self.neg_id] # No
pos_score = self.softmax(logits)[:, self.pos_id] # Yes
neg_score = self.softmax(logits)[:, self.neg_id] # No
cur_pos_score = [x.item() for x in pos_score]
cur_neg_score = [x.item() for x in neg_score]
......@@ -90,8 +85,8 @@ class UniEvaluator:
neg_score_list += cur_neg_score
except RuntimeError:
print(f'source: {src_list}')
print(f'target: {tgt_list}')
print(f"source: {src_list}")
print(f"target: {tgt_list}")
exit(0)
score_list = []
......
......@@ -31,105 +31,142 @@ import tqdm
def add_question(dimension, output, src=None, ref=None, context=None, task=None):
    """
    Add questions to generate input in Bool-QA format for UniEval.

    dimension: specific dimension to be evaluated
    output: output texts generated by the models (one prompt is built per element)
    src: source input for different NLG tasks. For example, source document for summarization
         and dialogue history for dialogue response generation.
    ref: human-annotated groundtruth
    context: the context needed to evaluate several specific dimension. For example,
             additional factual information when evaluating engagingness and groundedness in dialogues.
    task: one of "summarization", "dialogue", "data2text" or "fact"

    Returns a list of prompt strings, aligned index-by-index with `output`.

    Raises NotImplementedError for an unknown task or an unknown dimension of a
    known task. The check runs per element, so an empty `output` returns []
    without validating `task` or `dimension`.
    """
    undefined_dimension = "The input format for this dimension is still undefined. Please customize it first."

    input_with_question = []
    for i, cur_output in enumerate(output):
        # For summarization
        if task == "summarization":
            if dimension == "fluency":
                cur_input = "question: Is this a fluent paragraph? </s> paragraph: " + cur_output
            elif dimension == "coherence":
                cur_input = (
                    "question: Is this a coherent summary to the document? </s> summary: "
                    + cur_output
                    + " </s> document: "
                    + src[i]
                )
            elif dimension == "consistency":
                cur_input = (
                    "question: Is this claim consistent with the document? </s> claim: "
                    + cur_output
                    + " </s> document: "
                    + src[i]
                )
            elif dimension == "relevance":
                cur_input = (
                    "question: Is this summary relevant to the reference? </s> summary: "
                    + cur_output
                    + " </s> reference: "
                    + ref[i]
                )
            else:
                raise NotImplementedError(undefined_dimension)
        # For dialogues
        elif task == "dialogue":
            if dimension == "naturalness":
                cur_input = "question: Is this a natural response in the dialogue? </s> response: " + cur_output
            elif dimension == "coherence":
                cur_input = (
                    "question: Is this a coherent response given the dialogue history? </s> response: "
                    + cur_output
                    + " </s> dialogue history: "
                    + src[i]
                )
            elif dimension == "engagingness":
                cur_input = (
                    "question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: "
                    + cur_output
                    + " </s> dialogue history: "
                    + src[i]
                    + " </s> fact: "
                    + context[i]
                )
            elif dimension == "groundedness":
                cur_input = (
                    "question: Is this response consistent with knowledge in the fact? </s> response: "
                    + cur_output
                    + " </s> fact: "
                    + context[i]
                )
            elif dimension == "understandability":
                cur_input = "question: Is this an understandable response in the dialogue? </s> response: " + cur_output
            else:
                raise NotImplementedError(undefined_dimension)
        # For data-to-text
        elif task == "data2text":
            if dimension == "naturalness":
                cur_input = "question: Is this a fluent utterance? </s> utterance: " + cur_output
            elif dimension == "informativeness":
                cur_input = (
                    "question: Is this sentence informative according to the reference? </s> sentence: "
                    + cur_output
                    + " </s> reference: "
                    + ref[i]
                )
            else:
                raise NotImplementedError(undefined_dimension)
        # For factual consistency detection
        elif task == "fact":
            if dimension == "consistency":
                cur_input = (
                    "question: Is this claim consistent with the document? </s> claim: "
                    + cur_output
                    + " </s> document: "
                    + src[i]
                )
            else:
                raise NotImplementedError("No other dimensions for the factual consistency detection task.")
        # For new customized tasks
        else:
            raise NotImplementedError("Other tasks are not implemented, please customize specific tasks here.")
        input_with_question.append(cur_input)
    return input_with_question
def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
    """
    Convert the data into the unieval's format.

    Args:
        output_list: a list of model output
        src_list: source input for different NLG tasks. For example, source document for summarization
            and dialogue history for dialogue response generation
        ref_list: human-annotated groundtruth

    Returns:
        A list with one dict per entry of ``output_list``. Each dict always holds
        ``"system_output"`` and an empty ``"context"``; ``"source"`` and ``"reference"``
        are only present when the matching list was supplied.

    Raises:
        IndexError: if ``src_list`` or ``ref_list`` is given but shorter than ``output_list``.
    """
    json_data = []
    for i, system_output in enumerate(output_list):
        cur = {"system_output": system_output}
        if src_list is not None:
            cur["source"] = src_list[i]
        if ref_list is not None:
            cur["reference"] = ref_list[i]
        # The context field is always emitted, as an empty string.
        cur["context"] = ""
        json_data.append(cur)
    return json_data
def calculate_average_score(scores):
"""
Calculate average scores for different metrics
Calculate average scores for different metrics
scores: a list of scores for different metrics for each answer
scores: a list of scores for different metrics for each answer
"""
metrics = {metric: 0 for metric in scores[0]}
......@@ -226,9 +263,9 @@ def analyze_unieval_results(results_path: str, save_path: str) -> None:
frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"UniEval metrics: ",
total=len(frame_per_metric.keys()),
frame_per_metric.keys(),
desc=f"UniEval metrics: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
......
import io
import json
import os
import re
import string
from typing import Dict
......@@ -55,7 +54,7 @@ def jload(f, mode="r"):
def get_json_list(file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
json_list = []
for line in f:
json_list.append(json.loads(line))
......@@ -187,9 +186,9 @@ def analyze_automatic_results(results_path: str, save_path: str) -> None:
frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"automatic metrics: ",
total=len(frame_per_metric.keys()),
frame_per_metric.keys(),
desc=f"automatic metrics: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
......
......@@ -3,7 +3,6 @@ import json
from typing import Dict, Sequence
import torch
from datasets import load_dataset
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer
......@@ -20,7 +19,8 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer, max_length: i
padding="longest",
max_length=max_length,
truncation=True,
) for text in strings
)
for text in strings
]
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
......@@ -48,18 +48,17 @@ def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTo
class EasySupervisedDataset(Dataset):
def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
super(EasySupervisedDataset, self).__init__()
with open(data_file, "r", encoding="UTF-8") as f:
all_lines = f.readlines()
#split to source and target ,source the characters before "回答:" including "回答:", target the characters after "回答:"
# split to source and target ,source the characters before "回答:" including "回答:", target the characters after "回答:"
sources, targets = [], []
for line in all_lines:
if "回答:" in line:
sep_index = line.index("回答:")
sources.append(line[:sep_index + 3])
targets.append(line[sep_index + 3:] + tokenizer.eos_token)
sources.append(line[: sep_index + 3])
targets.append(line[sep_index + 3 :] + tokenizer.eos_token)
else:
sources.append(line)
targets.append("" + tokenizer.eos_token)
......@@ -83,15 +82,17 @@ class EasySupervisedDataset(Dataset):
class EasyPromptsDataset(Dataset):
def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
super(EasyPromptsDataset, self).__init__()
with open(data_file, "r", encoding="UTF-8") as f:
all_lines = f.readlines()
all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines]
all_lines = [line if "回答:" not in line else line[: line.index("回答:") + 3] for line in all_lines]
self.prompts = [
tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
tokenizer(line, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True)[
"input_ids"
]
.to(torch.cuda.current_device())
.squeeze(0)
for line in tqdm(all_lines)
]
self.data_file = data_file
......@@ -110,7 +111,6 @@ class EasyPromptsDataset(Dataset):
class EasyRewardDataset(Dataset):
def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
super(EasyRewardDataset, self).__init__()
self.chosen = []
......@@ -120,44 +120,42 @@ class EasyRewardDataset(Dataset):
else:
self.end_token = special_token
print(self.end_token)
#read all lines in the train_file to a list
# read all lines in the train_file to a list
with open(train_file, "r", encoding="UTF-8") as f:
all_lines = f.readlines()
for line in tqdm(all_lines):
data = json.loads(line)
prompt = "提问:" + data['prompt'] + " 回答:"
chosen = prompt + data['chosen'] + self.end_token
chosen_token = tokenizer(chosen,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.chosen.append({
"input_ids": chosen_token['input_ids'],
"attention_mask": chosen_token['attention_mask']
})
reject = prompt + data['rejected'] + self.end_token
reject_token = tokenizer(reject,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.reject.append({
"input_ids": reject_token['input_ids'],
"attention_mask": reject_token['attention_mask']
})
prompt = "提问:" + data["prompt"] + " 回答:"
chosen = prompt + data["chosen"] + self.end_token
chosen_token = tokenizer(
chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
self.chosen.append(
{"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
)
reject = prompt + data["rejected"] + self.end_token
reject_token = tokenizer(
reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
self.reject.append(
{"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
)
def __len__(self):
length = len(self.chosen)
return length
def __getitem__(self, idx):
return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
"input_ids"], self.reject[idx]["attention_mask"]
#python representation of the object and the string representation of the object
return (
self.chosen[idx]["input_ids"],
self.chosen[idx]["attention_mask"],
self.reject[idx]["input_ids"],
self.reject[idx]["attention_mask"],
)
# python representation of the object and the string representation of the object
def __repr__(self):
return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"
......@@ -165,26 +163,25 @@ class EasyRewardDataset(Dataset):
return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"
'''
"""
Easy SFT just accept a text file which can be read line by line. However the datasets will group texts together to max_length so LLM will learn the texts meaning better.
If individual lines are not related, just set is_group_texts to False.
'''
"""
class EasySFTDataset(Dataset):
def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:
super().__init__()
#read the data_file line by line
# read the data_file line by line
with open(data_file, "r", encoding="UTF-8") as f:
#encode the text data line by line and put raw python list input_ids only to raw_input_ids list
# encode the text data line by line and put raw python list input_ids only to raw_input_ids list
raw_input_ids = []
for line in f:
encoded_ids = tokenizer.encode(line)
#if the encoded_ids is longer than max_length, then split it into several parts
# if the encoded_ids is longer than max_length, then split it into several parts
if len(encoded_ids) > max_length:
for i in range(0, len(encoded_ids), max_length):
raw_input_ids.append(encoded_ids[i:i + max_length])
raw_input_ids.append(encoded_ids[i : i + max_length])
else:
raw_input_ids.append(encoded_ids)
......@@ -196,12 +193,13 @@ class EasySFTDataset(Dataset):
if is_group_texts:
for input_ids in raw_input_ids:
if len(current_input_ids) + len(input_ids) > max_length:
#pad the current_input_ids to max_length with tokenizer.pad_token_id
# pad the current_input_ids to max_length with tokenizer.pad_token_id
padded_length = max_length - len(current_input_ids)
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
)
current_input_ids = []
else:
current_input_ids.extend(input_ids)
......@@ -210,14 +208,16 @@ class EasySFTDataset(Dataset):
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
)
else:
#just append the raw_input_ids to max_length
# just append the raw_input_ids to max_length
for input_ids in raw_input_ids:
padded_length = max_length - len(input_ids)
input_ids.extend([tokenizer.pad_token_id] * padded_length)
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
)
grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
self.input_ids = grouped_input_ids
self.labels = copy.deepcopy(self.input_ids)
......@@ -227,14 +227,14 @@ class EasySFTDataset(Dataset):
def __len__(self):
return len(self.input_ids)
#get item from dataset
# get item from dataset
def __getitem__(self, idx):
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
#generate the dataset description to be printed by print in python
# generate the dataset description to be printed by print in python
def __repr__(self):
return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
#generate the dataset description to be printed by print in python
# generate the dataset description to be printed by print in python
def __str__(self):
return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
......@@ -4,7 +4,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from coati.models.generation import generate
from coati.models.utils import log_probs_from_logits, masked_mean
from coati.models.utils import log_probs_from_logits
from peft import PeftModel
from torch.nn.modules import Module
from transformers import BloomConfig, BloomForCausalLM
......@@ -24,38 +24,33 @@ class Actor(Module):
@torch.no_grad()
def generate(
    self, input_ids: torch.Tensor, return_action_mask: bool = True, **kwargs
) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
    """Generate sequences from ``self.model`` and build the accompanying masks.

    Args:
        input_ids: prompt token ids; ``input_ids.size(1)`` is used as the prompt length.
        return_action_mask: when False the third returned element is ``None``.
        **kwargs: forwarded unchanged to the module-level ``generate`` helper. The keys
            ``pad_token_id`` and ``eos_token_id`` are additionally read here.

    Returns:
        A 3-tuple ``(sequences, attention_mask, action_mask)``.
        ``attention_mask`` is ``None`` when no ``pad_token_id`` was passed, otherwise a
        long tensor that is 1 wherever ``sequences`` differs from ``pad_token_id``.
        ``action_mask`` is ``None`` when ``return_action_mask`` is False, otherwise a
        boolean mask restricted to the last ``sequences.size(1) - input_len`` columns.
    """
    # ``generate`` here is the module-level helper imported from coati.models.generation,
    # not this method.
    sequences = generate(self.model, input_ids, **kwargs)
    attention_mask = None
    pad_token_id = kwargs.get("pad_token_id", None)
    if pad_token_id is not None:
        attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
    if not return_action_mask:
        return sequences, attention_mask, None
    input_len = input_ids.size(1)
    eos_token_id = kwargs.get("eos_token_id", None)
    if eos_token_id is None:
        # Without an eos id every position starts out as an action.
        action_mask = torch.ones_like(sequences, dtype=torch.bool)
    else:
        # left padding may be applied, only mask action
        # True for generated positions strictly before the first eos token.
        action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
        # Shift right by one (dropping the last column) so the first eos token itself
        # is kept, and prepend True columns for the prompt.
        action_mask = F.pad(action_mask, (1 + input_len, -1), value=True)  # include eos token and input
    # Prompt positions are never actions.
    action_mask[:, :input_len] = False
    action_mask = action_mask[:, 1:]
    # NOTE(review): if no token was generated the slice start is -0, which selects every
    # column instead of none — confirm callers always generate at least one token.
    return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len) :]
def forward(
    self, sequences: torch.LongTensor, num_actions: int, attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """Returns action log probs

    Args:
        sequences: token ids fed to ``self.model``.
        num_actions: number of trailing positions whose log probs are returned.
        attention_mask: optional mask forwarded to ``self.model``.

    Returns:
        The last ``num_actions`` columns of the per-token log probs.
    """
    output = self.model(sequences, attention_mask=attention_mask)
    logits = output["logits"]
    # Logits at position t are paired with the token at position t + 1, hence the
    # one-step offset between the two slices.
    # presumably log_probs_from_logits picks the log-prob of each label token — verify in coati.models.utils
    log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
    # NOTE(review): num_actions == 0 would make this slice return every column — confirm callers pass > 0.
    return log_probs[:, -num_actions:]
......@@ -75,11 +70,13 @@ class BLOOMActor(Actor):
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(self,
pretrained: str = None,
config: Optional[BloomConfig] = None,
checkpoint: bool = False,
lora_path: str = None) -> None:
def __init__(
self,
pretrained: str = None,
config: Optional[BloomConfig] = None,
checkpoint: bool = False,
lora_path: str = None,
) -> None:
if pretrained is not None:
model = BloomForCausalLM.from_pretrained(pretrained)
elif config is not None:
......
import argparse
import pandas as pd
import torch
import torch.distributed as dist
from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
from coati.dataset import DataCollatorForSupervisedDataset
from coati.models.bloom import BLOOMRM, BLOOMCritic
from coati.models.gpt import GPTRM, GPTActor, GPTCritic
from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
from coati.models.opt import OPTRM, OPTActor, OPTCritic
from coati.models.gpt import GPTRM, GPTCritic
from coati.models.llama import LlamaCritic, LlamaRM
from coati.models.opt import OPTRM, OPTCritic
from coati.trainer import PPOTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
from easy_models import BLOOMActor
from peft import PeftModel
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
......@@ -23,24 +21,24 @@ from colossalai.nn.optimizer import HybridAdam
def main(args):
# configure strategy
if args.strategy == 'ddp':
if args.strategy == "ddp":
strategy = DDPStrategy()
elif args.strategy == 'colossalai_gemini':
strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
elif args.strategy == 'colossalai_zero2':
strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
elif args.strategy == "colossalai_gemini":
strategy = GeminiStrategy(placement_policy="cpu", initial_scale=2**5)
elif args.strategy == "colossalai_zero2":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
if args.rm_path is not None:
state_dict = torch.load(args.rm_path, map_location='cpu')
state_dict = torch.load(args.rm_path, map_location="cpu")
# configure model
if args.model == 'bloom':
if args.model == "bloom":
# initial_model = BLOOMActor(pretrained=args.pretrain)
print('Using peft lora to load Bloom model as initial_model')
print("Using peft lora to load Bloom model as initial_model")
initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
print('Using peft lora to load Bloom model as initial_model (Done)')
print("Using peft lora to load Bloom model as initial_model (Done)")
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
......@@ -49,59 +47,59 @@ def main(args):
else:
rm_model_name = args.rm_model
if rm_model_name == 'gpt2':
if rm_model_name == "gpt2":
reward_model = GPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'bloom':
elif rm_model_name == "bloom":
print("load bloom reward model ", args.rm_pretrain)
reward_model = BLOOMRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'opt':
elif rm_model_name == "opt":
reward_model = OPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'llama':
elif rm_model_name == "llama":
reward_model = LlamaRM(pretrained=args.rm_pretrain)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_path is not None:
print('Loading reward model from', args.rm_path)
print("Loading reward model from", args.rm_path)
reward_model.load_state_dict(state_dict)
if args.strategy != 'colossalai_gemini':
if args.strategy != "colossalai_gemini":
initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())
with strategy.model_init_context():
if args.model == 'bloom':
if args.model == "bloom":
# actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
print('Using peft lora to load Bloom model as Actor')
print("Using peft lora to load Bloom model as Actor")
actor = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
print('Using peft lora to load Bloom model as Actor (Done)')
print("Using peft lora to load Bloom model as Actor (Done)")
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
if rm_model_name == 'gpt2':
if rm_model_name == "gpt2":
critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
elif rm_model_name == 'bloom':
elif rm_model_name == "bloom":
print("load bloom critic ", args.rm_pretrain, " lora_rank ", args.lora_rank, " use_action_mask ", True)
critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
print("load bloom critic (Done) ")
elif rm_model_name == 'opt':
elif rm_model_name == "opt":
critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
elif rm_model_name == 'llama':
elif rm_model_name == "llama":
critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_path is not None:
print('Loading reward model from', args.rm_path)
print("Loading reward model from", args.rm_path)
critic.load_state_dict(state_dict)
del state_dict
if args.strategy != 'colossalai_gemini':
if args.strategy != "colossalai_gemini":
critic.to(torch.float16).to(torch.cuda.current_device())
actor.to(torch.float16).to(torch.cuda.current_device())
# configure optimizer
if args.strategy.startswith('colossalai'):
if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=1e-7)
critic_optim = HybridAdam(critic.parameters(), lr=1e-7)
else:
......@@ -109,18 +107,18 @@ def main(args):
critic_optim = Adam(critic.parameters(), lr=1e-7)
# configure tokenizer
if args.model == 'gpt2':
if args.model == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained(args.rm_pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(args.rm_pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained(args.rm_pretrain)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'llama':
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
tokenizer.eos_token = '<\s>'
tokenizer.eos_token = "<\s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
......@@ -132,26 +130,27 @@ def main(args):
prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
else:
prompt_sampler = None
prompt_dataloader = DataLoader(prompt_dataset,
shuffle=(prompt_sampler is None),
sampler=prompt_sampler,
batch_size=args.train_batch_size)
prompt_dataloader = DataLoader(
prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, batch_size=args.train_batch_size
)
pretrain_dataset = EasySupervisedDataset(args.pretrain_dataset, tokenizer)
if dist.is_initialized() and dist.get_world_size() > 1:
pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
else:
pretrain_sampler = None
pretrain_dataloader = DataLoader(pretrain_dataset,
shuffle=(pretrain_sampler is None),
sampler=pretrain_sampler,
batch_size=args.ptx_batch_size,
collate_fn=data_collator)
pretrain_dataloader = DataLoader(
pretrain_dataset,
shuffle=(pretrain_sampler is None),
sampler=pretrain_sampler,
batch_size=args.ptx_batch_size,
collate_fn=data_collator,
)
def tokenize_fn(texts):
# MUST padding to max length to ensure inputs of all ranks have the same length
# Different length may lead to hang when using gemini, as different generation steps
batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
return {k: v.to(torch.cuda.current_device()) for k, v in batch.items()}
(actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
......@@ -178,45 +177,46 @@ def main(args):
eos_token_id=tokenizer.eos_token_id,
)
trainer.fit(prompt_dataloader=prompt_dataloader,
pretrain_dataloader=pretrain_dataloader,
num_episodes=args.num_episodes,
num_update_steps=args.num_update_steps,
num_collect_steps=args.num_collect_steps)
trainer.fit(
prompt_dataloader=prompt_dataloader,
pretrain_dataloader=pretrain_dataloader,
num_episodes=args.num_episodes,
num_update_steps=args.num_update_steps,
num_collect_steps=args.num_collect_steps,
)
# save model checkpoint after fitting
trainer.save_model(args.save_path, only_rank0=True, tokenizer=tokenizer)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
strategy.save_optimizer(actor_optim,
'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
only_rank0=False)
strategy.save_optimizer(
actor_optim, "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()), only_rank0=False
)
if __name__ == "__main__":

    def _str2bool(value: str) -> bool:
        """Parse a command-line boolean such as ``True``/``false``/``1``/``0``.

        ``type=bool`` would turn every non-empty string, including ``"False"``, into True.
        """
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    # Command-line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt_path", type=str, default=None, help="path to the prompt dataset")
    parser.add_argument("--pretrain_dataset", type=str, default=None, help="path to the pretrained dataset")
    parser.add_argument(
        "--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="ddp", help="strategy to use"
    )
    # NOTE(review): the visible parts of main() only build an actor for "bloom"; the
    # default "gpt2" appears to end in "Unsupported actor model" — confirm the intended default.
    parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
    parser.add_argument("--pretrain", type=str, default=None)
    parser.add_argument("--sft_lora_path", type=str, default=None)
    parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
    parser.add_argument("--rm_path", type=str, default=None)
    parser.add_argument("--rm_pretrain", type=str, default=None)
    parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
    # Accepts "--need_optim_ckpt", "--need_optim_ckpt True" and "--need_optim_ckpt False".
    parser.add_argument("--need_optim_ckpt", type=_str2bool, nargs="?", const=True, default=False)
    parser.add_argument("--num_episodes", type=int, default=10)
    parser.add_argument("--num_collect_steps", type=int, default=10)
    parser.add_argument("--num_update_steps", type=int, default=5)
    parser.add_argument("--train_batch_size", type=int, default=2)
    parser.add_argument("--ptx_batch_size", type=int, default=1)
    parser.add_argument("--experience_batch_size", type=int, default=8)
    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
    parser.add_argument("--kl_coef", type=float, default=0.1)
    parser.add_argument("--ptx_coef", type=float, default=0.9)
    args = parser.parse_args()
    main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment