Unverified commit 079bf3cb, authored by Hongxin Liu, committed by GitHub
Browse files

[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
parent 3c6b831c
......@@ -39,8 +39,9 @@ class SFTTrainer(SLTrainer):
accumulation_steps: int = 8,
) -> None:
if accumulation_steps > 1:
assert not isinstance(strategy, GeminiStrategy), \
"Accumulation steps are not supported in stage 3 of ColossalAI"
assert not isinstance(
strategy, GeminiStrategy
), "Accumulation steps are not supported in stage 3 of ColossalAI"
super().__init__(strategy, max_epochs, model, optim)
......@@ -50,15 +51,11 @@ class SFTTrainer(SLTrainer):
def _train(self, epoch: int):
self.model.train()
for batch_id, batch in enumerate(self.train_dataloader):
batch = to_device(batch, torch.cuda.current_device())
if "attention_mask" in batch:
outputs = self.model(batch["input_ids"],
attention_mask=batch["attention_mask"],
labels=batch["labels"])
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
else:
outputs = self.model(batch["input_ids"],
labels=batch["labels"])
outputs = self.model(batch["input_ids"], labels=batch["labels"])
loss = outputs.loss
loss = loss / self.accumulation_steps
......@@ -73,12 +70,14 @@ class SFTTrainer(SLTrainer):
self.optimizer.zero_grad()
self.scheduler.step()
if is_rank_0() and self.use_wandb:
wandb.log({
wandb.log(
{
"loss": self.total_loss / self.accumulation_steps,
"lr": self.scheduler.get_last_lr()[0],
"epoch": epoch,
"batch_id": batch_id
})
"batch_id": batch_id,
}
)
self.total_loss = 0
self.step_bar.update()
......@@ -89,9 +88,9 @@ class SFTTrainer(SLTrainer):
loss_sum, num_seen = 0, 0
for batch in self.eval_dataloader:
batch = to_device(batch, torch.cuda.current_device())
outputs = self.model(batch["input_ids"],
attention_mask=batch["attention_mask"],
labels=batch["labels"])
outputs = self.model(
batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]
)
loss = outputs.loss
loss_sum += loss.item()
......@@ -99,13 +98,15 @@ class SFTTrainer(SLTrainer):
loss_mean = loss_sum / num_seen
if dist.get_rank() == 0:
self.logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}')
self.logger.info(f"Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}")
def _before_fit(self,
def _before_fit(
self,
train_dataloader: DataLoader,
eval_dataloader: Optional[DataLoader] = None,
logger: Optional[DistributedLogger] = None,
use_wandb: bool = False):
use_wandb: bool = False,
):
"""
Args:
train_dataloader: the dataloader to use for training
......@@ -124,6 +125,6 @@ class SFTTrainer(SLTrainer):
self.no_epoch_bar = True
self.step_bar = tqdm.trange(
len(self.train_dataloader) // self.accumulation_steps * self.max_epochs,
desc=f'steps',
disable=not is_rank_0()
desc=f"steps",
disable=not is_rank_0(),
)
......@@ -2,7 +2,4 @@ from .base import Strategy
from .colossalai import GeminiStrategy, LowLevelZeroStrategy
from .ddp import DDPStrategy
__all__ = [
'Strategy', 'DDPStrategy',
'LowLevelZeroStrategy', 'GeminiStrategy'
]
__all__ = ["Strategy", "DDPStrategy", "LowLevelZeroStrategy", "GeminiStrategy"]
......@@ -83,16 +83,18 @@ class Strategy(ABC):
rets.append((model, optimizer))
elif isinstance(arg, Dict):
model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
boost_result = dict(model=model,
boost_result = dict(
model=model,
optimizer=optimizer,
criterion=criterion,
dataloader=dataloader,
lr_scheduler=lr_scheduler)
lr_scheduler=lr_scheduler,
)
# remove None values
boost_result = {key: value for key, value in boost_result.items() if value is not None}
rets.append(boost_result)
else:
raise RuntimeError(f'Type {type(arg)} is not supported')
raise RuntimeError(f"Type {type(arg)} is not supported")
return rets[0] if len(rets) == 1 else rets
......@@ -125,11 +127,9 @@ class Strategy(ABC):
return DistributedSampler(dataset, 1, 0)
@abstractmethod
def save_pretrained(self,
model: nn.Module,
path: str,
only_rank0: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
def save_pretrained(
self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
) -> None:
pass
@abstractmethod
......
......@@ -42,11 +42,12 @@ class LowLevelZeroStrategy(DDPStrategy):
"""
def __init__(self,
def __init__(
self,
stage: int = 2,
precision: str = 'fp16',
precision: str = "fp16",
seed: int = 42,
placement_policy: str = 'cuda',
placement_policy: str = "cuda",
reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2
overlap_communication: bool = True, # only for stage 1&2
initial_scale: float = 2**16,
......@@ -57,12 +58,11 @@ class LowLevelZeroStrategy(DDPStrategy):
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0
norm_type: float = 2.0,
) -> None:
assert stage in (1, 2), f'Unsupported stage "{stage}"'
assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
plugin_initializer = lambda: LowLevelZeroPlugin(
# zero_config
......@@ -71,7 +71,7 @@ class LowLevelZeroStrategy(DDPStrategy):
# zero_optim_config
reduce_bucket_size_in_m=reduce_bucket_size,
overlap_communication=overlap_communication,
cpu_offload=(placement_policy == 'cpu'),
cpu_offload=(placement_policy == "cpu"),
# optim_config
initial_scale=initial_scale,
growth_factor=growth_factor,
......@@ -81,14 +81,15 @@ class LowLevelZeroStrategy(DDPStrategy):
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type
norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
assert isinstance(self.plugin, LowLevelZeroPlugin), \
f'{type(self).__name__}\'s plugin is not initialized properly.'
assert isinstance(
self.plugin, LowLevelZeroPlugin
), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
......@@ -131,10 +132,11 @@ class GeminiStrategy(DDPStrategy):
"""
def __init__(self,
def __init__(
self,
seed: int = 42,
shard_init: bool = False, # only for stage 3
placement_policy: str = 'cuda',
placement_policy: str = "cuda",
pin_memory: bool = True, # only for stage 3
force_outputs_fp32: bool = False, # only for stage 3
search_range_m: int = 32, # only for stage 3
......@@ -149,27 +151,26 @@ class GeminiStrategy(DDPStrategy):
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0
norm_type: float = 2.0,
) -> None:
assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
# TODO(ver217): support shard_init when using from_pretrained()
if shard_init:
warnings.warn(
f'Shard init is not supported model.from_pretrained() yet. '
'Please load weights after strategy.prepare()'
f"Shard init is not supported model.from_pretrained() yet. "
"Please load weights after strategy.prepare()"
)
self.shard_init = shard_init
warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")
# NOTE: dist should be initialized before calling get_current_device()
plugin_initializer = lambda: GeminiPlugin(
# gemini_config
device=get_current_device(),
placement_policy=placement_policy,
precision='fp16',
precision="fp16",
pin_memory=pin_memory,
force_outputs_fp32=force_outputs_fp32,
strict_ddp_mode=shard_init,
......@@ -187,14 +188,13 @@ class GeminiStrategy(DDPStrategy):
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type
norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
assert isinstance(self.plugin, GeminiPlugin), \
f'{type(self).__name__}\'s plugin is not initialized properly.'
assert isinstance(self.plugin, GeminiPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
......@@ -203,10 +203,9 @@ class GeminiStrategy(DDPStrategy):
world_size = dist.get_world_size()
shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
return ColoInitContext(device=get_current_device(),
dtype=torch.half,
default_pg=shard_pg,
default_dist_spec=default_dist_spec)
return ColoInitContext(
device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
)
def unwrap_model(self, model: nn.Module) -> nn.Module:
assert isinstance(model, GeminiModel)
......
......@@ -34,21 +34,18 @@ class DDPStrategy(Strategy):
Strategy for distributed training using torch.distributed.
"""
def __init__(self,
seed: int = 42,
plugin_initializer: Callable = TorchDDPPlugin
) -> None:
def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
self.seed = seed
super().__init__(plugin_initializer)
def _try_init_dist(self, force: bool = False) -> None:
try:
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
host = os.environ['MASTER_ADDR']
port = int(os.environ['MASTER_PORT'])
dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
host = os.environ["MASTER_ADDR"]
port = int(os.environ["MASTER_PORT"])
dist.init_process_group("nccl", init_method=f"tcp://[{host}]:{port}", world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
except KeyError as e:
if force:
......@@ -60,8 +57,7 @@ class DDPStrategy(Strategy):
raise e
def _post_init(self) -> None:
assert isinstance(self.plugin, TorchDDPPlugin), \
f'{type(self).__name__}\'s plugin is not initialized properly.'
assert isinstance(self.plugin, TorchDDPPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
self._try_init_dist(force=True)
......@@ -73,12 +69,14 @@ class DDPStrategy(Strategy):
torch.manual_seed(seed)
def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
return self.plugin.prepare_dataloader(data_buffer,
return self.plugin.prepare_dataloader(
data_buffer,
batch_size=data_buffer.sample_batch_size,
shuffle=True,
drop_last=True,
pin_memory=pin_memory,
collate_fn=data_buffer.collate_fn)
collate_fn=data_buffer.collate_fn,
)
def setup_sampler(self, dataset) -> DistributedSampler:
# FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
......@@ -88,11 +86,9 @@ class DDPStrategy(Strategy):
assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
return model.unwrap()
def save_pretrained(self,
model: nn.Module,
path: str,
only_rank0: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
def save_pretrained(
self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
) -> None:
if not only_rank0 or dist.get_rank() == 0:
unwrapped_model = self.unwrap_model(model)
assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
......@@ -103,17 +99,11 @@ class DDPStrategy(Strategy):
if tokenizer is not None:
tokenizer.save_pretrained(path)
model_path = os.path.join(path, "pytorch_model.bin")
self.save_model(model,
model_path,
only_rank0=only_rank0)
self.save_model(model, model_path, only_rank0=only_rank0)
def _replace_keys(model_path: str,
replace_fn: Callable):
def _replace_keys(model_path: str, replace_fn: Callable):
state_dict = torch.load(model_path, map_location="cpu")
state_dict = {
replace_fn(k): v
for k, v in state_dict.items()
}
state_dict = {replace_fn(k): v for k, v in state_dict.items()}
torch.save(state_dict, model_path)
# FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
......@@ -124,13 +114,13 @@ class DDPStrategy(Strategy):
def get_model_state_dict_shard(self, model: nn.Module, **config):
# TODO: implement sharding on naive strategy
model = self.unwrap_model(model)
if 'requires_grad_only' in config and config['requires_grad_only'] == True:
if "requires_grad_only" in config and config["requires_grad_only"] == True:
state_dict = get_grad_required_state_dict(model)
else:
state_dict = model.state_dict()
if 'shard_size' in config:
shard_size = config['shard_size']
if "shard_size" in config:
shard_size = config["shard_size"]
accumulate_size = 0
state_dict_shard = OrderedDict()
for name, param in state_dict.items():
......
......@@ -4,7 +4,6 @@ import numpy as np
class DistributedSampler:
def __init__(self, dataset, num_replicas: int, rank: int) -> None:
self.dataset = dataset
self.num_replicas = num_replicas
......@@ -20,10 +19,10 @@ class DistributedSampler:
self.total_size = self.num_samples * self.num_replicas
indices = list(range(len(self.dataset)))
indices = indices[:self.total_size]
indices = indices[: self.total_size]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
self.indices = indices
......
......@@ -42,7 +42,6 @@ def is_rank_0() -> bool:
def to_device(x: Any, device: torch.device) -> Any:
def _to(t: Any):
if isinstance(t, torch.Tensor):
return t.to(device)
......
import argparse
import json
import os
import openai
......@@ -9,7 +8,8 @@ from utils import jload
def main(args):
assert len(args.answer_file_list) == len(
args.model_name_list), "The number of answer files and model names should be equal!"
args.model_name_list
), "The number of answer files and model names should be equal!"
# load config
config = jload(args.config_file)
......@@ -36,7 +36,8 @@ def main(args):
if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
raise Exception(
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
)
if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
raise Exception(
......@@ -44,8 +45,15 @@ def main(args):
)
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"], config.get("path_for_UniEval", None), args.gpt_with_reference)
evaluator = Evaluator(
metrics_per_category,
battle_prompt,
gpt_evaluation_prompt,
args.gpt_model,
config["language"],
config.get("path_for_UniEval", None),
args.gpt_with_reference,
)
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
......@@ -68,41 +76,41 @@ def main(args):
raise ValueError(f'Unsupported language {config["language"]}!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
parser.add_argument('--config_file',
type=str,
default=None,
required=True,
help='path to the file of target results')
parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
parser.add_argument('--gpt_evaluation_prompt_file',
type=str,
default=None,
help='path to the prompt file for gpt evaluation')
parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
parser.add_argument('--answer_file_list',
type=str,
nargs='+',
default=[],
required=True,
help='path to the answer files of at most 2 models')
parser.add_argument('--model_name_list',
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
parser.add_argument(
"--config_file", type=str, default=None, required=True, help="path to the file of target results"
)
parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
parser.add_argument(
"--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
)
parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
parser.add_argument(
"--answer_file_list",
type=str,
nargs='+',
nargs="+",
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--gpt_model',
help="path to the answer files of at most 2 models",
)
parser.add_argument(
"--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
)
parser.add_argument(
"--gpt_model",
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--gpt_with_reference',
help="which GPT model to use for evaluation",
)
parser.add_argument(
"--gpt_with_reference",
default=False,
action="store_true",
help='whether to include reference answer in gpt evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
help="whether to include reference answer in gpt evaluation",
)
parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
args = parser.parse_args()
if args.openai_key is not None:
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -7,6 +7,9 @@ from .utils import (
)
__all__ = [
'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results',
'analyze_unieval_results'
"get_evaluator",
"convert_data_to_unieval_format",
"calculate_average_score",
"save_unieval_results",
"analyze_unieval_results",
]
This diff is collapsed.
import io
import json
import os
import re
import string
from typing import Dict
......@@ -55,7 +54,7 @@ def jload(f, mode="r"):
def get_json_list(file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
json_list = []
for line in f:
json_list.append(json.loads(line))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment