Commit 9fdb7dab authored by yuguo960516

bloom
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow.utils.data import Sampler
class CyclicSampler(Sampler):
"""
This sampler supports cyclic sampling and works with both non-data-parallel
and data-parallel training.
Arguments:
dataset: dataset to be sampled.
micro_batch_size: batch size per model instance;
global_batch_size is micro_batch_size times data_parallel_size.
shuffle: whether to shuffle the dataset.
consumed_samples: the number of samples already consumed in previous training,
used for resuming training (default: ``0``).
data_parallel_rank: local rank for data parallelism.
data_parallel_size: the size of data parallelism.
seed: random seed, used for reproducing experiments (default: ``0``).
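Example:
A minimal usage sketch (``train_dataset`` is a placeholder for any map-style dataset):
.. code-block:: python
sampler = CyclicSampler(train_dataset, micro_batch_size=4, shuffle=True)
batch_indices = next(iter(sampler))  # a list of 4 dataset indices for this rank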
"""
def __init__(
self,
dataset,
micro_batch_size,
shuffle=False,
consumed_samples=0,
data_parallel_rank=0,
data_parallel_size=1,
seed=0,
):
self.dataset = dataset
self.data_size = len(self.dataset)
self.shuffle = shuffle
self.data_parallel_rank = data_parallel_rank
self.data_parallel_size = data_parallel_size
self.micro_batch_size = micro_batch_size
self.actual_batch_size = self.micro_batch_size * self.data_parallel_size
self.data_size_per_epoch = self.data_size // self.actual_batch_size * self.micro_batch_size
self.consumed_samples = consumed_samples
self.seed = seed
def __iter__(self):
"""divide the data into data_parallel_size buckets,
and shuffle it if `shuffle` is set to `True`.
Each processor samples from its own buckets and data_loader
will load the corresponding data.
"""
epoch = self.consumed_samples // self.data_size_per_epoch
current_epoch_samples = self.consumed_samples % self.data_size_per_epoch
batch = []
while True:
bucket_offset = current_epoch_samples // self.data_parallel_size
start_idx = self.data_parallel_rank * self.data_size_per_epoch
if self.shuffle:
generator = flow.Generator()
generator.manual_seed(self.seed + epoch)
random_idx = flow.randperm(self.data_size_per_epoch, generator=generator).tolist()
indices = [start_idx + x for x in random_idx[bucket_offset:]]
else:
seq_idx = flow.arange(self.data_size_per_epoch).tolist()
indices = [start_idx + x for x in seq_idx[bucket_offset:]]
epoch += 1
if hasattr(self.dataset, "supports_prefetch") and self.dataset.supports_prefetch:
self.dataset.prefetch(indices)
for idx in indices:
batch.append(idx)
if len(batch) == self.micro_batch_size:
self.consumed_samples += self.actual_batch_size
yield batch
batch = []
current_epoch_samples = 0
def __len__(self):
return self.data_size
def set_consumed_samples(self, consumed_samples):
"""You can recover the training iteration by setting `consumed_samples`."""
self.consumed_samples = consumed_samples
def set_epoch(self, epoch):
"""Used for restoring training status."""
self.epoch = epoch
class SingleRoundSampler(Sampler):
"""
This sampler supports single-round sampling and works with both non-data-parallel
and data-parallel training.
Arguments:
dataset: dataset to be sampled.
micro_batch_size: batch size per model instance; global_batch_size
is micro_batch_size times data_parallel_size.
shuffle: whether to shuffle the dataset.
data_parallel_rank: local rank for data parallelism.
data_parallel_size: the size of data parallelism.
seed: random seed, used for reproducing experiments (default: ``0``).
drop_last: whether to drop the remaining data (default: ``False``).
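Example:
A worked example of the resulting length: with ``data_size=10``, ``micro_batch_size=4``
and ``data_parallel_size=1``, ``len(sampler)`` is ``10 // 4 = 2`` when
``drop_last=True`` and ``(10 + 4 - 1) // 4 = 3`` otherwise.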
"""
def __init__(
self,
dataset,
micro_batch_size,
shuffle=False,
data_parallel_rank=0,
data_parallel_size=1,
seed=0,
drop_last=False,
):
self.dataset = dataset
self.data_size = len(self.dataset)
self.shuffle = shuffle
self.data_parallel_rank = data_parallel_rank
self.data_parallel_size = data_parallel_size
self.micro_batch_size = micro_batch_size
self.seed = seed
self.drop_last = drop_last
def __iter__(self):
bucket_size = self.data_size // self.data_parallel_size
remain = self.data_size % self.data_parallel_size
start_idx = self.data_parallel_rank * bucket_size
if self.data_parallel_rank < remain:
bucket_size += 1
start_idx += min(self.data_parallel_rank, remain)
if self.shuffle:
generator = flow.Generator()
generator.manual_seed(self.seed)
random_idx = flow.randperm(bucket_size, generator=generator).tolist()
indices = [start_idx + x for x in random_idx]
else:
seq_idx = flow.arange(bucket_size).tolist()
indices = [start_idx + x for x in seq_idx]
if hasattr(self.dataset, "supports_prefetch") and self.dataset.supports_prefetch:
self.dataset.prefetch(indices)
batch = []
for idx in indices:
batch.append(idx)
if len(batch) == self.micro_batch_size:
yield batch
batch = []
if not self.drop_last:
if self.data_parallel_rank >= remain and remain > 0:
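# Ranks with rank >= remain received one sample fewer; pad them with index 0
# so that every rank yields the same number of batches.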
batch.append(0)
if len(batch) > 0:
yield batch
def __len__(self):
global_batch_size = self.micro_batch_size * self.data_parallel_size
if self.drop_last:
return self.data_size // global_batch_size
else:
return (self.data_size + global_batch_size - 1) // global_batch_size
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Any, List
import oneflow as flow
from libai.utils import distributed as dist
@dataclass
class DistTensorData:
tensor: flow.Tensor
sbp_list: list = field(default_factory=lambda: ["split_0", "broadcast"])
placement_idx: int = 0
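# A minimal usage sketch (the tensor shape is illustrative):
#   data = DistTensorData(flow.zeros(2, 8))
#   data.to_global()  # the tensor becomes global with the default sbp_list
#                     # ["split_0", "broadcast"] on the stage-0 placement,
#                     # then moves to `placement_idx` if it is nonzero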
# Tensor-like methods
def to_global(self, sbp=None, placement=None, device_type="cuda"):
if sbp is not None:
self.sbp = sbp
else:
sbp_list = []
for sbp in self.sbp_list:
sbp = sbp.split("_")
if len(sbp) > 1:
# split dim
assert sbp[0] == "split"
split_dim = int(sbp[1])
sbp_list.append(flow.sbp.split(split_dim))
else:
sbp_sign = sbp[0]
sbp_list.append(getattr(flow.sbp, sbp_sign))
self.sbp = dist.get_nd_sbp(sbp_list)
if placement is not None:
self.tensor = self.tensor.to_global(sbp=self.sbp, placement=placement)
else:
# Convert the local tensor to a global tensor with the default setting
# if the placement parameter is not provided.
# When pipeline-parallel training is enabled, all devices are grouped into
# several device groups and the model is split into several stages;
# each stage is placed on the corresponding device group.
# For those tensors to be used in the last stage, we first convert them to
# global tensors that only retain the data on device group 0, then transfer
# the result to the last stage.
# We do this to make sure that all tensors used by the model are generated
# by the first device group, in case each device group applies random
# augmentations to the tensors without sharing the same global seed.
main_placement = dist.get_layer_placement(0, device_type)
self.tensor = self.tensor.to_global(sbp=self.sbp, placement=main_placement)
if self.placement_idx != 0:
self.tensor = self.tensor.to_global(
placement=dist.get_layer_placement(self.placement_idx, device_type)
)
@staticmethod
def stack(distTensor_lists: List["DistTensorData"]) -> "DistTensorData":
if not isinstance(distTensor_lists[0].tensor, flow.Tensor):
raise TypeError(
"DistTensorData.tensor must be a flow.Tensor, but got {}. "
"Please check the return values of `__getitem__` in dataset.".format(
type(distTensor_lists[0].tensor)
)
)
assert len(distTensor_lists) > 0
if len(distTensor_lists) == 1:
# TODO(l1aoxingyu): add inplace unsqueeze
# distTensor_lists[0].tensor.unsqueeze_(0) # add batch dim
distTensor_lists[0].tensor = distTensor_lists[0].tensor.unsqueeze(0) # add batch dim
return distTensor_lists[0]
tensor_size = distTensor_lists[0].tensor.size()
sbp_list = distTensor_lists[0].sbp_list
placement_idx = distTensor_lists[0].placement_idx
tensors = []
for data in distTensor_lists:
assert (
data.tensor.size() == tensor_size
), f"tensor shape is not equal, {data.tensor.size()} != {tensor_size}"
assert (
data.sbp_list == sbp_list
), f"sbp_list is not equal, {data.sbp_list} != {sbp_list}!"
assert (
data.placement_idx == placement_idx
), f"placement_idx is not equal, {data.placement_idx} != {placement_idx}"
tensors.append(data.tensor)
tensors = flow.stack(tensors, dim=0)
ret = DistTensorData(tensors, sbp_list=sbp_list, placement_idx=placement_idx)
return ret
class Instance:
"""
This class represents an instance with metadata as attributes.
It stores the attributes of an instance (e.g., image, tokens) as "fields".
All other (non-field) attributes of this class are considered private:
they must start with '_' and are not modifiable by a user.
Some basic usage:
1. Set/get/check a field:
.. code-block:: python
instance.tokens = Metadata(...)
instance.mask = Metadata(...)
print(instance.tokens)
print(instance.has("mask")) # True
2. ``len(instance)`` returns the number of fields
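3. Batch a list of instances (a sketch; every field must be stackable,
e.g. tensors of the same shape):
.. code-block:: python
batched = Instance.stack([instance_a, instance_b])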
"""
def __init__(self, **kwargs):
self._fields = OrderedDict()
for k, v in kwargs.items():
self.set(k, v)
def __setattr__(self, name: str, val: Any) -> None:
if name.startswith("_"):
super().__setattr__(name, val)
else:
self.set(name, val)
def __getattr__(self, name: str):
if name == "_fields" or name not in self._fields:
raise AttributeError(f"Cannot find field '{name}' in the given Instance!")
return self._fields[name]
def set(self, name: str, value: Any):
"""
Set the field named `name` to `value`.
"""
self._fields[name] = value
def has(self, name: str):
return name in self._fields
def remove(self, name: str):
del self._fields[name]
def get(self, name: str):
return self._fields[name]
def get_fields(self):
return self._fields
def __len__(self):
return len(self._fields.keys())
def __iter__(self):
raise NotImplementedError("`Instance` object is not iterable!")
@staticmethod
def stack(instance_lists: List["Instance"]) -> "Instance":
assert all(isinstance(i, Instance) for i in instance_lists)
assert len(instance_lists) > 0
ret = Instance()
for k in instance_lists[0]._fields.keys():
values = [i.get(k) for i in instance_lists]
v0 = values[0]
if isinstance(v0, flow.Tensor):
values = flow.stack(values, dim=0)
elif isinstance(v0, list):
pass
elif hasattr(type(v0), "stack"):
values = type(v0).stack(values)
else:
raise ValueError("Unsupported type {} for stack.".format(type(v0)))
ret.set(k, values)
return ret
def __str__(self):
s = self.__class__.__name__ + "("
s += "fields=[{}]".format(", ".join((f"{k}: {v}" for k, v in self._fields.items())))
return s
__repr__ = __str__
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .default import DefaultTrainer, default_setup
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math
import os
import time
from collections import OrderedDict
from typing import Callable, Optional
import oneflow as flow
from omegaconf import OmegaConf
from termcolor import colored
from libai.config import LazyConfig, instantiate, try_get_key
from libai.data import Instance
from libai.engine import hooks
from libai.engine.trainer import EagerTrainer, GraphTrainer, TrainerBase
from libai.evaluation import inference_on_dataset, print_csv_format
from libai.models import build_graph, build_model
from libai.optim import build_optimizer
from libai.scheduler import build_lr_scheduler
from libai.tokenizer import build_tokenizer
from libai.utils import distributed as dist
from libai.utils.checkpoint import Checkpointer
from libai.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
from libai.utils.logger import setup_logger
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py
# --------------------------------------------------------
def _highlight(code, filename):
try:
import pygments
except ImportError:
return code
from pygments.formatters import Terminal256Formatter
from pygments.lexers import Python3Lexer, YamlLexer
lexer = Python3Lexer() if filename.endswith(".py") else YamlLexer()
code = pygments.highlight(code, lexer, Terminal256Formatter(style="monokai"))
return code
def _check_batch_size(cfg):
train_micro_batch_size = try_get_key(cfg, "train.train_micro_batch_size", default=None)
global_batch_size = try_get_key(cfg, "train.global_batch_size", default=None)
num_accumulation_steps = try_get_key(cfg, "train.num_accumulation_steps", default=None)
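# The three values above are tied by the invariant
#   global_batch_size == train_micro_batch_size * data_parallel_size * num_accumulation_steps
# e.g. train_micro_batch_size=4, data_parallel_size=2, num_accumulation_steps=8
# gives global_batch_size=64. Whichever values are missing are derived below.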
if train_micro_batch_size is not None and global_batch_size is not None:
if num_accumulation_steps is None:
if global_batch_size % (train_micro_batch_size * dist.get_data_parallel_size()) != 0:
raise ValueError(
f"global_batch_size {global_batch_size} must be divisible by "
"train_micro_batch_size * data_parallel_size "
f"({train_micro_batch_size} * {dist.get_data_parallel_size()})"
)
cfg.train.num_accumulation_steps = global_batch_size // (
train_micro_batch_size * dist.get_data_parallel_size()
)
else:
if (
global_batch_size
!= train_micro_batch_size * dist.get_data_parallel_size() * num_accumulation_steps
):
raise ValueError(
f"global_batch_size {global_batch_size} must equal to "
"train_micro_batch_size * data_parallel_size * num_accumulation_steps "
f"({train_micro_batch_size} * {dist.get_data_parallel_size()} * {num_accumulation_steps})" # noqa
)
elif train_micro_batch_size is not None and global_batch_size is None:
if num_accumulation_steps is None:
cfg.train.num_accumulation_steps = 1
cfg.train.global_batch_size = (
train_micro_batch_size
* dist.get_data_parallel_size()
* cfg.train.num_accumulation_steps
)
elif train_micro_batch_size is None and global_batch_size is not None:
if num_accumulation_steps is None:
cfg.train.num_accumulation_steps = 1
if (
global_batch_size % (dist.get_data_parallel_size() * cfg.train.num_accumulation_steps)
!= 0
):
raise ValueError(
f"global_batch_size {global_batch_size} must be divisible by "
"data_parallel_size * num_accumulation_steps "
f"({dist.get_data_parallel_size()} * {cfg.train.num_accumulation_steps})"
)
cfg.train.train_micro_batch_size = global_batch_size // (
dist.get_data_parallel_size() * cfg.train.num_accumulation_steps
)
else:
raise ValueError("train_micro_batch_size and global_batch_size must be set either")
# Set total training samples.
cfg.train.samples = cfg.train.train_iter * cfg.train.global_batch_size
def _compile_dependencies():
logger = logging.getLogger(__name__)
# =========================
# Compile dataset C++ code.
# =========================
# TODO: move this to ninja
if dist.get_local_rank() == 0:
start_time = time.time()
logger.info("> compiling dataset index builder ...")
from libai.data.data_utils import compile_helper
compile_helper()
logger.info(
">>> done with dataset index builder. Compilation time: {:.3f} "
"seconds".format(time.time() - start_time)
)
dist.synchronize()
if dist.get_local_rank() == 0:
logger.info(
">>> done with compiling. "
"Compilation time: {:.3f} seconds".format(time.time() - start_time)
)
def default_setup(cfg, args):
"""
Perform some basic common setups at the beginning of a job, including:
1. Set up the libai logger
2. Log basic information about environment, cmdline arguments, and config
3. Setup the distributed environment
4. Setup tokenizer if it's an NLP-related task
5. Check batch_size
6. Backup the config to the output directory
7. Compile dependencies
Args:
args (argparse.Namespace): the command line arguments to be logged
"""
output_dir = try_get_key(cfg, "train.output_dir")
if dist.is_main_process() and output_dir:
os.makedirs(output_dir, exist_ok=True)
cfg.train.resume = args.resume
rank = dist.get_rank()
logger = setup_logger(output_dir, distributed_rank=rank)
logger.info("Rank of current process: {}. World size: {}".format(rank, dist.get_world_size()))
logger.info("Command line arguments: " + str(args))
if hasattr(args, "config_file") and args.config_file != "":
logger.info(
"Contents of args.config_file={}:\n{}".format(
args.config_file,
_highlight(open(args.config_file, "r").read(), args.config_file),
)
)
dist.setup_dist_util(cfg.train.dist)
_check_batch_size(cfg)
if dist.is_main_process() and output_dir:
# Note: some of our scripts may expect the existence of
# config.yaml in output directory
path = os.path.join(output_dir, "config.yaml")
LazyConfig.save(cfg, path)
logger.info("Full config saved to {}".format(path))
flow.boxing.nccl.set_fusion_threshold_mbytes(
try_get_key(cfg, "train.nccl_fusion_threshold_mb", default=16)
)
flow.boxing.nccl.set_fusion_max_ops_num(
try_get_key(cfg, "train.nccl_fusion_max_ops", default=24)
)
_compile_dependencies()
class DefaultTrainer(TrainerBase):
"""
A trainer with default training logic. Compared to `TrainerBase`, it
also contains the following logic:
1. Create model, optimizer, scheduler, dataloader from the given config.
2. Load a checkpoint or `cfg.train.load_weight`, if it exists.
3. Register a few common hooks defined by the config.
With standard features, it is created to simplify the **standard model training workflow** and
reduce code boilerplate for users who only need the standard training workflow.
It means this class makes **many assumptions** about your training logic that
may easily become invalid in new research. In fact, any assumptions beyond those made in the
:class:`TrainerBase` are too much for research.
The code of this class has been annotated about restrictive assumptions it made.
When they do not work for you, you're encouraged to:
1. Overwrite methods of this class, OR:
2. Use :class:`TrainerBase`, which only does minimal SGD training and
nothing else. You can then add your own hooks if needed. OR:
3. Write your own training loop similar to ``tools/train_net.py``.
Also note that the behavior of this class, like other functions/classes in
this file, is not stable, since it is meant to represent the "common default behavior".
It is only guaranteed to work well with the standard models and training workflow in libai.
To obtain more stable behavior, write your own training logic with other public APIs.
Examples:
.. code-block:: python
trainer = DefaultTrainer(cfg)
trainer.resume_or_load()  # load last checkpoint or cfg.train.load_weight
trainer.train()
Attributes:
scheduler:
checkpointer (Checkpointer):
cfg (omegaconf.dictconfig.DictConfig):
"""
def __init__(self, cfg):
"""
Args:
cfg (omegaconf.dictconfig.DictConfig):
"""
super().__init__()
self.cfg = cfg
logger = logging.getLogger("libai")
# setup_logger is not called for LiBai
if not logger.isEnabledFor(logging.INFO):
setup_logger()
# Initialize tokenizer
self.tokenizer = self.build_tokenizer(cfg)
self.start_iter = 0
if cfg.train.resume:
save_file = os.path.join(cfg.train.output_dir, "last_checkpoint")
try:
with open(save_file, "r") as f:
last_saved = f.read().strip()
assert (
last_saved != "model_final"
), "model training has finished, check your model in train.output_dir"
self.start_iter = int(last_saved.split("_")[-1]) + 1
except IOError:
# If file doesn't exist, maybe because it has just been deleted.
# We just set start_iter to 0.
self.start_iter = 0
if cfg.graph.enabled:
cfg.dataloader.consumed_samples = self.start_iter * cfg.train.global_batch_size
else:
cfg.dataloader.consumed_samples = (
self.start_iter * cfg.train.global_batch_size // cfg.train.num_accumulation_steps
)
self.train_loader = None
self.test_loader = []
train_loader, val_loader, test_loader = self.build_train_loader(cfg, self.tokenizer)
self.train_loader = train_loader
if val_loader is not None:
self.test_loader.append(val_loader)
if test_loader is not None:
self.test_loader.append(test_loader)
self.test_loader.extend(self.build_test_loader(cfg, self.tokenizer))
if cfg.train.rdma_enabled:
# set rdma
flow.env.init_rdma()
# Automatically scale the hyperparams
self.auto_scale_hyperparams(cfg, self.train_loader)
# Assume these objects must be constructed in this order.
dist.synchronize()
start_time = time.time()
logger.info("> Start building model...")
self.model = self.build_model(cfg)
dist.synchronize()
logger.info(
">>> done with building model. "
"Building time: {:.3f} seconds".format(time.time() - start_time)
)
self.optimizer = self.build_optimizer(cfg, self.model)
self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer)
if cfg.graph.enabled:
self.graph_train = self.build_graph(
cfg, self.model, self.optimizer, self.lr_scheduler, is_train=True
)
self.graph_eval = self.build_graph(cfg, self.model, is_train=False)
self._trainer = GraphTrainer(
self.graph_train, self.train_loader, cfg.train.num_accumulation_steps
)
else:
self._trainer = EagerTrainer(
self.model, self.train_loader, self.optimizer, cfg.train.num_accumulation_steps
)
# Assume no other objects need to be checkpointed.
# We can later make it checkpoint the stateful hooks
if cfg.graph.enabled:
self.checkpointer = Checkpointer(
# Assume you want to save checkpoints together with logs/statistics
self.model,
cfg.train.output_dir,
# In static graph mode, optimizer and scheduler state_dict will
# be saved with graph.state_dict().
graph=self.graph_train,
# We print lr by `LRScheduler` hook, so we need to save/load eager lr_scheduler,
# otherwise, lr will be reset to initial state when resuming training.
lr_scheduler=self.lr_scheduler,
)
else:
self.checkpointer = Checkpointer(
# Assume you want to save checkpoints together with logs/statistics
self.model,
cfg.train.output_dir,
optimizer=self.optimizer,
lr_scheduler=self.lr_scheduler,
)
# Loading checkpoint before dataloader construction, because
# dataloader needs to know the consumed iterations from
# the last breakpoint.
self.resume_or_load(cfg.train.resume)
cfg.train.start_iter = self.start_iter
# global_batch_size = micro_batch_size * num_gpus * num_accumulation_steps
# When using gradient accumulation in graph mode, each run_step handles
# `global_batch_size` samples.
# When using gradient accumulation in eager mode, each run_step only handles
# `micro_batch_size * num_gpus` samples, so we need to divide by `num_accumulation_steps`
# to get the actual `batch_size` for computing `throughput` and `consumed_samples`.
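# e.g. train.global_batch_size=256 and num_accumulation_steps=4:
#   graph mode: each run_step consumes 256 samples -> self.global_batch_size = 256
#   eager mode: each run_step consumes 256 // 4 = 64 samples -> self.global_batch_size = 64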
self.global_batch_size = (
cfg.train.global_batch_size
if cfg.graph.enabled
else cfg.train.global_batch_size // cfg.train.num_accumulation_steps
)
self.max_iter = cfg.train.train_iter
self.register_hooks(self.build_hooks())
def resume_or_load(self, resume=True):
"""
If `resume==True` and `cfg.train.output_dir` contains the last checkpoint (defined by
a `last_checkpoint` file), resume from that file. Resuming means loading all
available states (e.g. optimizer and scheduler) and updating the iteration counter
from the checkpoint. ``cfg.train.load_weight`` will not be used.
Otherwise, this is considered an independent training run. The method will load model
weights from the file ``cfg.train.load_weight`` (but will not load other states) and start
from iteration 0.
Args:
resume (bool): whether to do resume or not
"""
weight_path = self.cfg.train.load_weight
assert isinstance(
weight_path, str
), f"cfg.train.load_weight:{self.cfg.train.load_weight} must be string"
if resume:
assert self.checkpointer.has_checkpoint()
# The checkpoint stores the training iteration that just finished, thus we start
# at the next iteration (or iter zero if there's no checkpoint).
assert self.start_iter == (
self.checkpointer.resume_or_load(None, resume=True).get("iter", -1) + 1
)
elif len(weight_path) != 0:
assert os.path.isdir(
weight_path
), f"cfg.train.load_weight:{self.cfg.train.load_weight} must be directory"
self.checkpointer.load(weight_path, checkpointables=[])
def build_hooks(self):
"""
Build a list of default hooks, including timing, evaluation,
checkpointing, lr scheduling, precise BN, writing events.
Returns:
list[HookBase]:
"""
ret = [
hooks.IterationTimer(),
hooks.LRScheduler(),  # for pretty lr scheduler printing in `nn.Graph` mode
hooks.PeriodicCheckpointer(
self.checkpointer,
self.cfg.train.checkpointer.period,
max_to_keep=self.cfg.train.checkpointer.max_to_keep,
),
]
if self.cfg.train.evaluation.enabled:
assert self.cfg.train.evaluation.eval_iter > 0, "eval_iter must be a positive number"
def test_and_save_results():
model = self.graph_eval if self.cfg.graph.enabled else self.model
self._last_eval_results = self.test(self.cfg, self.test_loader, model)
return self._last_eval_results
ret.append(hooks.EvalHook(self.cfg.train.evaluation.eval_period, test_and_save_results))
ret.append(
hooks.BestCheckpointer(
self.cfg.train.evaluation.eval_period,
self.checkpointer,
val_metric=try_get_key(
self.cfg, "train.evaluation.eval_metric", default="Acc@1"
),
mode=try_get_key(self.cfg, "train.evaluation.eval_mode", default="max"),
)
)
if dist.is_main_process():
# run writers in the end, so that evaluation metrics are written
ret.append(hooks.PeriodicWriter(self.build_writers(), self.cfg.train.log_period))
return ret
def build_writers(self):
"""
Build a list of writers to be used. By default it contains
writers that write metrics to the screen,
a json file, and a tensorboard event file respectively.
If you'd like a different list of writers, you can overwrite it in
your trainer.
Returns:
list[EventWriter]: a list of :class:`EventWriter` objects.
It is now implemented by:
.. code-block:: python
return [
CommonMetricPrinter(self.global_batch_size, self.max_iter),
JSONWriter(os.path.join(self.cfg.train.output_dir, "metrics.json")),
TensorboardXWriter(self.cfg.train.output_dir),
]
"""
# Assume the default print/log frequency.
return [
# It may not always print what you want to see, since it prints "common" metrics only.
CommonMetricPrinter(self.global_batch_size, self.max_iter),
JSONWriter(os.path.join(self.cfg.train.output_dir, "metrics.json")),
TensorboardXWriter(self.cfg.train.output_dir),
]
def train(self):
"""
Run training.
Returns:
OrderedDict of results, if evaluation is enabled. Otherwise None.
"""
super().train(self.start_iter, self.max_iter)
def run_step(self):
self._trainer.iter = self.iter
self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
@classmethod
def get_batch(
cls,
data: Instance,
input_placement_device: str = "cuda",
mixup_func: Optional[Callable] = None,
):
"""
Convert batched local tensor to distributed tensor for model step running.
If you want to do something with the batched data before it reaches the model
(e.g. mixup), you can override this function.
"""
if isinstance(data, flow.utils.data._utils.worker.ExceptionWrapper):
data.reraise()
if mixup_func is not None:
images, labels = mixup_func(
data.get("images").tensor.cuda(),
data.get("labels").tensor.cuda(),
)
data.get("images").tensor = images
data.get("labels").tensor = labels
ret_dict = {}
for key, value in data.get_fields().items():
value.to_global(device_type=input_placement_device)
ret_dict[key] = value.tensor
return ret_dict
@classmethod
def build_tokenizer(cls, cfg):
"""
Returns:
libai.tokenizer.PreTrainedTokenizer:
It now calls :func:`libai.tokenizer.build_tokenizer`.
"""
tokenizer = None
if try_get_key(cfg, "tokenization") is not None:
tokenizer = build_tokenizer(cfg.tokenization)
# FIXME(lxy): In case model is not defined with cfg, the `vocab_size` can be
# accessed by `model.vocab_size`.
if try_get_key(cfg, "model.cfg.vocab_size", default=None) is not None:
# In case the model does not need vocab_size as argument
multiple = (
cfg.tokenization.make_vocab_size_divisible_by
* cfg.train.dist.tensor_parallel_size
)
cfg.model.cfg.vocab_size = tokenizer.padded_vocab_size(multiple)
return tokenizer
@classmethod
def build_model(cls, cfg):
"""
Returns:
flow.nn.Module:
It now calls :func:`libai.models.build_model`.
Overwrite it if you'd like a different model.
"""
assert try_get_key(cfg, "model") is not None, "cfg must contain `model` namespace"
# Set the model fp16 option, because the embedding layer needs a manual
# `white_identity` insert for amp training, if provided.
if try_get_key(cfg.model, "cfg.amp_enabled") is not None:
cfg.model.cfg.amp_enabled = cfg.train.amp.enabled and cfg.graph.enabled
# In case some models are defined without a cfg keyword.
elif try_get_key(cfg.model, "amp_enabled") is not None:
cfg.model.amp_enabled = cfg.train.amp.enabled and cfg.graph.enabled
model = build_model(cfg.model)
logger = logging.getLogger(__name__)
logger.info("Model:\n{}".format(model))
model._apply(dist.convert_to_distributed_default_setting)
return model
@classmethod
def build_graph(cls, cfg, model, optimizer=None, lr_scheduler=None, is_train=True):
assert try_get_key(cfg, "graph") is not None, "cfg must contain `graph` namespace"
graph = build_graph(cfg, model, optimizer, lr_scheduler, is_train)
debug_graph = try_get_key(cfg, "graph.debug", default=-1)
if debug_graph >= 0:
logger = logging.getLogger(__name__)
logger.info("Graph debug mode on, automatically output debug info.")
graph.debug(cfg.graph.debug)
return graph
@classmethod
def build_optimizer(cls, cfg, model):
"""
Returns:
flow.optim.Optimizer:
It now calls :func:`libai.optim.build_optimizer`.
Overwrite it if you'd like a different optimizer.
"""
assert try_get_key(cfg, "optim") is not None, "cfg must contain `optim` namespace"
return build_optimizer(cfg.optim, model)
@classmethod
def build_lr_scheduler(cls, cfg, optimizer):
"""
It now calls :func:`libai.scheduler.build_lr_scheduler`.
Overwrite it if you'd like a different scheduler.
"""
assert (
try_get_key(cfg, "train.scheduler") is not None
), "cfg.train must contain `scheduler` namespace"
return build_lr_scheduler(cfg.train.scheduler, optimizer)
@classmethod
def build_train_loader(cls, cfg, tokenizer=None):
"""
Returns:
iterable
It now calls :func:`libai.data.build_train_valid_test_loader`.
Overwrite it if you'd like a different data loader.
"""
assert (
try_get_key(cfg, "dataloader.train") is not None
), "cfg must contain `dataloader.train` namespace"
logger = logging.getLogger(__name__)
logger.info("Prepare training, validating, testing set")
if cfg.graph.enabled:
# In static graph mode, data will be sliced in nn.Graph automatically;
# the dataloader gets micro-batch-size samples and the data is concatenated
# in graph_trainer.run_step to form the mini-batch-size.
cfg.dataloader.train.train_batch_size = cfg.train.train_micro_batch_size
else:
# In eager mode, gradient accumulation will act like PyTorch, so dataloader
# will get micro-batch-size
cfg.dataloader.train.train_batch_size = cfg.train.train_micro_batch_size
cfg.dataloader.train.test_batch_size = cfg.train.test_micro_batch_size
cfg.dataloader.train.seed = cfg.train.seed
# used by nlp dataloader
if hasattr(cfg.dataloader.train, "train_val_test_num_samples"):
eval_iter = (
(cfg.train.train_iter // cfg.train.evaluation.eval_period + 1)
* cfg.train.evaluation.eval_iter
if cfg.train.evaluation.enabled
# samples for test_dataset must be larger than 0 even if there is no evaluation
else 1
)
test_iter = cfg.train.evaluation.eval_iter if cfg.train.evaluation.enabled else 1
cfg.dataloader.train.train_val_test_num_samples = [
int(cfg.train.samples),
int(eval_iter * cfg.train.test_micro_batch_size * dist.get_data_parallel_size()),
int(test_iter * cfg.train.test_micro_batch_size * dist.get_data_parallel_size()),
]
if OmegaConf.is_list(cfg.dataloader.train.dataset):
for dataset in cfg.dataloader.train.dataset:
if hasattr(dataset, "seed"):
dataset.seed = cfg.train.seed
else:
dataset = cfg.dataloader.train.dataset
if hasattr(dataset, "seed"):
dataset.seed = cfg.train.seed
# Set tokenizer for each dataset
if tokenizer:
if OmegaConf.is_list(cfg.dataloader.train.dataset):
for dataset in cfg.dataloader.train.dataset:
dataset.tokenizer = tokenizer
else:
cfg.dataloader.train.dataset.tokenizer = tokenizer
train_loader, valid_loader, test_loader = instantiate(
cfg.dataloader.train, _recursive_=False
)
return train_loader, valid_loader, test_loader
@classmethod
def build_test_loader(cls, cfg, tokenizer=None):
"""
Returns:
iterable
It now calls :func:`libai.data.build_image_test_loader` for CV tasks
or :func:`libai.data.build_nlp_test_loader` for NLP tasks.
Overwrite it if you'd like a different data loader.
"""
# If there is no test_loader, just return []
if not try_get_key(cfg, "dataloader.test", default=False):
return []
logger = logging.getLogger(__name__)
logger.info("Prepare testing set")
assert OmegaConf.is_list(
cfg.dataloader.test
), f"dataloader.test must be list but got type of {type(cfg.dataloader.test)}"
for i in range(len(cfg.dataloader.test)):
cfg.dataloader.test[i].test_batch_size = cfg.train.test_micro_batch_size
cfg.dataloader.test[i].seed = cfg.train.seed # set seed
if tokenizer:
cfg.dataloader.test[i].dataset.tokenizer = tokenizer
# list[dataloader1, dataloader2, ...]
test_loader = instantiate(cfg.dataloader.test, _recursive_=False)
return test_loader
@classmethod
def auto_scale_hyperparams(cls, cfg, data_loader):
logger = logging.getLogger(__name__)
log_info = ""
# Get or set default iteration cfg
train_iter = try_get_key(cfg, "train.train_iter", default=0)
train_epoch = try_get_key(cfg, "train.train_epoch", default=0)
warmup_ratio = try_get_key(cfg, "train.warmup_ratio", default=0)
assert (
warmup_ratio < 1 and warmup_ratio >= 0
), "warmup_ratio must be in [0, 1) that presents the ratio of warmup iter to the train iter"
# Automatically scale the iteration number depending on the settings.
# The total number of iterations in one epoch is `len(dataset) / global_batch_size`.
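# e.g. len(dataset)=50000, global_batch_size=256, train_epoch=90
#   -> train_iter = ceil(50000 * 90 / 256) = 17579 (unless train.train_iter is larger)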
cfg.train.train_iter = max(
math.ceil(len(data_loader.dataset) * train_epoch / cfg.train.global_batch_size),
train_iter,
)
cfg.train.warmup_iter = math.ceil(cfg.train.train_iter * cfg.train.warmup_ratio)
if not cfg.graph.enabled:
# In eager mode, the dataloader only gets micro-batch-size samples each iteration,
# which is mini-batch-size // num_accumulation, so scale `train_iter`
# and `warmup_iter` to be consistent with static graph mode.
cfg.train.train_iter *= cfg.train.num_accumulation_steps
cfg.train.warmup_iter *= cfg.train.num_accumulation_steps
log_info += "Auto-scaling the config to train.train_iter={}, train.warmup_iter={}".format(
cfg.train.train_iter, cfg.train.warmup_iter
)
# Automatically scale the milestones
if try_get_key(cfg, "train.scheduler.milestones"):
if len(
[
milestone
for milestone in cfg.train.scheduler.milestones
if milestone < 0 or milestone >= 1
]
):
raise ValueError(
"milestones should be a list of increasing ratio in [0, 1), but got {}".format(
cfg.train.scheduler.milestones
)
)
cfg.train.scheduler.milestones = [
int(milestone * cfg.train.train_iter)
for milestone in cfg.train.scheduler.milestones
]
log_info += f", scheduler milestones={cfg.train.scheduler.milestones}"
logger.info(log_info)
# Global scheduler cfg
cfg.train.scheduler.warmup_iter = cfg.train.warmup_iter
cfg.train.scheduler.max_iter = cfg.train.train_iter
# train iter per epoch
iter_per_epoch = len(data_loader.dataset) // cfg.train.global_batch_size
# rescale eval period
if try_get_key(cfg, "train.evaluation.eval_after_n_epoch"):
cfg.train.evaluation.eval_period = (
iter_per_epoch * cfg.train.evaluation.eval_after_n_epoch
)
logger.info(
f"Auto-scaling the config "
f"train.evaluation.eval_after_n_epoch={cfg.train.evaluation.eval_after_n_epoch} "
f"to train.evaluation.eval_period={cfg.train.evaluation.eval_period}"
)
# rescale save model period
if try_get_key(cfg, "train.checkpointer.save_model_after_n_epoch"):
cfg.train.checkpointer.period = (
iter_per_epoch * cfg.train.checkpointer.save_model_after_n_epoch
)
logger.info(
f"Auto-scaling the config "
f"train.checkpointer.save_model_after_n_epoch="
f"{cfg.train.checkpointer.save_model_after_n_epoch} "
f"to train.checkpointer.period={cfg.train.checkpointer.period}"
)
@classmethod
def build_evaluator(cls, cfg):
evaluator = instantiate(cfg.train.evaluation.evaluator)
return evaluator
@classmethod
def test(cls, cfg, test_loaders, model, evaluator=None):
"""
Evaluate the given model. The given model is expected to already contain
weights to evaluate.
Args:
cfg (CfgNode):
test_loaders: list [dataloader1, dataloader2, ...]
model (nn.Graph):
evaluator (DatasetEvaluator or None): if None, :meth:`build_evaluator`
will be called to build one.
Returns:
dict: a dict of result metrics
"""
logger = logging.getLogger(__name__)
# TODO: support multi evaluator
# if isinstance(evaluators, DatasetEvaluator):
# evaluators = [evaluators]
test_batch_size = cfg.train.test_micro_batch_size * dist.get_data_parallel_size()
evaluator = cls.build_evaluator(cfg) if not evaluator else evaluator
results = OrderedDict()
for idx, data_loader in enumerate(test_loaders):
# When evaluators are passed in as arguments,
# implicitly assume that evaluators can be created before data_loader.
dataset_name = type(data_loader.dataset).__name__
# TODO: support multi evaluator
# if evaluators is not None:
# evaluator = evaluators[idx]
# else:
# try:
# evaluator = cls.build_evaluator(cfg)
# except NotImplementedError:
# logger.warn(
# "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
# "or implement its `build_evaluator` method."
# )
# results[dataset_name] = {}
# continue
results_i = inference_on_dataset(
model,
data_loader,
test_batch_size,
cfg.train.evaluation.eval_iter,
cls.get_batch,
cfg.train.input_placement_device,
evaluator,
)
results[dataset_name] = results_i
if dist.is_main_process():
assert isinstance(
results_i, dict
), "Evaluator must return a dict on the main process. Got {} instead.".format(
results_i
)
logger.info(
"Evaluation results for {} in csv format:".format(
colored(dataset_name, "green")
)
)
print_csv_format(results_i)
if len(results) == 1:
results = list(results.values())[0]
return results
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import logging
import math
import operator
import time
from collections import Counter
import oneflow as flow
from libai.evaluation import flatten_results_dict
from libai.utils import distributed as dist
from libai.utils.checkpoint import Checkpointer
from libai.utils.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
from libai.utils.events import EventWriter
from libai.utils.timer import Timer
from .trainer import HookBase
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/hooks.py
# --------------------------------------------------------
"""
Implement some common hooks.
"""
logger = logging.getLogger(__name__)
class CallbackHook(HookBase):
"""
Create a hook using callback functions provided by the user.
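Example (a minimal sketch; ``trainer`` is any :class:`TrainerBase` instance):
.. code-block:: python
trainer.register_hooks([CallbackHook(after_step=lambda t: print(t.iter))])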
"""
def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
"""
Each argument is a function that takes one argument: the trainer.
"""
self._before_train = before_train
self._before_step = before_step
self._after_step = after_step
self._after_train = after_train
def before_train(self):
if self._before_train:
self._before_train(self.trainer)
def after_train(self):
if self._after_train:
self._after_train(self.trainer)
# The functions may be closures that hold reference to the trainer
# Therefore, delete them to avoid circular reference.
del self._before_train, self._after_train
del self._before_step, self._after_step
def before_step(self):
if self._before_step:
self._before_step(self.trainer)
def after_step(self):
if self._after_step:
self._after_step(self.trainer)
class IterationTimer(HookBase):
"""
Track the time spent for each iteration (each run_step call in the trainer).
Print a summary in the end of training.
This hook uses the time between the call to its :meth:`before_step`
and :meth:`after_step` methods.
Under the convention that :meth:`before_step` of all hooks should only
take negligible amount of time, the :class:`IterationTimer` hook should be
placed at the beginning of the list of hooks to obtain accurate timing.
"""
def __init__(self, warmup_iter=3):
"""
Args:
warmup_iter (int): the number of iterations at the beginning to exclude
from timing.
"""
self._warmup_iter = warmup_iter
self._step_timer = Timer()
def before_train(self):
self._start_time = time.perf_counter()
self._total_timer = Timer()
self._total_timer.pause()
def after_train(self):
total_time = time.perf_counter() - self._start_time
total_time_minus_hooks = self._total_timer.seconds()
hook_time = total_time - total_time_minus_hooks
num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter
if num_iter > 0 and total_time_minus_hooks > 0:
# Speed is meaningful only after warmup
# NOTE this format is parsed by grep in some scripts
logger.info(
"Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
num_iter,
str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
total_time_minus_hooks / num_iter,
)
)
logger.info(
"Total training time: {} ({} on hooks)".format(
str(datetime.timedelta(seconds=int(total_time))),
str(datetime.timedelta(seconds=int(hook_time))),
)
)
def before_step(self):
self._step_timer.reset()
self._total_timer.resume()
def after_step(self):
# +1 because we're in after_step
iter_done = self.trainer.iter - self.trainer.start_iter + 1
if iter_done >= self._warmup_iter:
sec = self._step_timer.seconds()
self.trainer.storage.put_scalars(time=sec)
else:
self._start_time = time.perf_counter()
self._total_timer.reset()
self._total_timer.pause()
class PeriodicWriter(HookBase):
"""
Write events to EventStorage periodically.
It is executed every ``period`` iterations and after the last iteration.
"""
def __init__(self, writers, period=20):
"""
Args:
writers (list[EventWriter]): a list of EventWriter objects
period (int):
"""
self._writers = writers
for w in writers:
assert isinstance(w, EventWriter), w
self._period = period
def after_step(self):
if (self.trainer.iter + 1) % self._period == 0 or (
self.trainer.iter == self.trainer.max_iter - 1
):
for writer in self._writers:
writer.write()
def after_train(self):
for writer in self._writers:
writer.close()
class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
"""
Same as :class:`libai.utils.checkpoint.PeriodicCheckpointer`, but as a hook.
Note that when used as a hook,
it is unable to save additional data other than what's defined
by the given `checkpointer`.
It is executed every ``period`` iterations and after the last iteration.
"""
def before_train(self):
self.max_iter = self.trainer.max_iter
def after_step(self):
self.step(self.trainer.iter)
class BestCheckpointer(HookBase):
"""
Checkpoints best weights based off given metric.
This hook should be used in conjunction with, and executed after, the hook
that produces the metric, e.g. `EvalHook`.
"""
def __init__(
self,
eval_period: int,
checkpointer: Checkpointer,
val_metric: str,
mode: str = "max",
file_prefix: str = "model_best",
) -> None:
"""
Args:
eval_period (int): the period `EvalHook` is set to run.
checkpointer: the checkpointer object used to save checkpoints.
val_metric (str): validation metric to track for best checkpoint, e.g. "acc@1"
mode (str): one of {'max', 'min'}. controls whether the chosen val metric should be
maximized or minimized, e.g. for "acc@1" it should be "max"
file_prefix (str): the prefix of checkpoint's filename, defaults to "model_best"
"""
self._period = eval_period
self._val_metric = val_metric
assert mode in [
"max",
"min",
], f'Mode "{mode}" to `BestCheckpointer` is unknown. It should be one of {"max", "min"}.'
if mode == "max":
self._compare = operator.gt
else:
self._compare = operator.lt
self._checkpointer = checkpointer
self._file_prefix = file_prefix
self.best_metric = None
self.best_iter = None
def _update_best(self, val, iteration):
if math.isnan(val) or math.isinf(val):
return False
self.best_metric = val
self.best_iter = iteration
return True
def _best_checking(self):
metric_tuple = self.trainer.storage.latest().get(self._val_metric)
flag = flow.zeros(1)
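# `flag` is set to 1 on the main process when a better checkpoint should be saved;
# it is broadcast to every rank below so that all processes call `save()` together.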
if dist.is_main_process():
if metric_tuple is None:
logger.warning(
f"Given val metric {self._val_metric} does not seem to be computed/stored. "
"Will not be checkpointed based on that."
)
else:
latest_metric, metric_iter = metric_tuple
if self.best_metric is None:
if self._update_best(latest_metric, metric_iter):
flag = flag + 1
logger.info(
f"Saved first model at {self.best_metric:0.5f} @ {self.best_iter} steps"
)
elif self._compare(latest_metric, self.best_metric):
flag = flag + 1
logger.info(
f"Saved best model as latest eval score for {self._val_metric} is "
f"{latest_metric:0.5f}, better than last best score "
f"{self.best_metric:0.5f} @ iteration {self.best_iter}."
)
self._update_best(latest_metric, metric_iter)
else:
logger.info(
f"Not saving as latest eval score for "
f"{self._val_metric} is {latest_metric:0.5f}, "
f"not better than best score {self.best_metric:0.5f} "
f"@ iteration {self.best_iter}."
)
dist.synchronize()
flag = flag.to_global(
sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cpu")
)
if flag.to_local().item() == 1:
self._checkpointer.save(f"{self._file_prefix}")
def after_step(self):
# same conditions as `EvalHook`
next_iter = self.trainer.iter + 1
if (
self._period > 0
and next_iter % self._period == 0
and next_iter != self.trainer.max_iter
):
self._best_checking()
def after_train(self):
# same conditions as `EvalHook`
if self.trainer.iter + 1 >= self.trainer.max_iter:
self._best_checking()
class EvalHook(HookBase):
"""
Run an evaluation function periodically, and at the end of training.
It is executed every ``eval_period`` iterations and after the last iteration.
"""
def __init__(self, eval_period, eval_function):
"""
Args:
eval_period (int): the period to run `eval_function`.
eval_function (callable): a function which takes no arguments, and
returns a nested dict of evaluation metrics.
Note:
This hook must be enabled in either all workers or none.
If you would like only certain workers to perform evaluation,
give other workers a no-op function (`eval_function=lambda: None`).
"""
self._period = eval_period
self._func = eval_function
def _do_eval(self):
results = self._func()
if results:
assert isinstance(
results, dict
), "Eval function must return a dict. Got {} instead.".format(results)
flattened_results = flatten_results_dict(results)
for k, v in flattened_results.items():
try:
v = float(v)
except Exception:
raise ValueError(
"[EvalHook] eval_function should return a nested dict of float. "
"Got '{}: {}' instead.".format(k, v)
)
self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
# Evaluation may take different amounts of time among workers.
# A barrier makes them start the next iteration together.
dist.synchronize()
def after_step(self):
next_iter = self.trainer.iter + 1
if self._period > 0 and next_iter % self._period == 0:
# do the last eval in after_train
if next_iter != self.trainer.max_iter:
self._do_eval()
def after_train(self):
# This condition is to prevent the eval from running after a failed training
if self.trainer.iter + 1 >= self.trainer.max_iter:
self._do_eval()
# func is likely a closure that holds reference to the trainer
# therefore we clean it to avoid circular reference in the end
del self._func
class LRScheduler(HookBase):
"""
A hook which executes a oneflow builtin LR scheduler and summarizes the LR.
It is executed after every iteration.
"""
def __init__(self, optimizer=None, scheduler=None):
"""
Args:
optimizer (flow.optim.Optimizer):
scheduler (flow.optim.LRScheduler):
if a :class:`ParamScheduler` object, it defines the multiplier over the base LR
in the optimizer.
If any argument is not given, will try to obtain it from the trainer.
"""
self._optimizer = optimizer
self._scheduler = scheduler
def before_train(self):
self._optimizer = self._optimizer or self.trainer.optimizer
self._best_param_group_id = LRScheduler.get_best_param_group_id(self._optimizer)
@staticmethod
def get_best_param_group_id(optimizer):
# NOTE: some heuristics on what LR to summarize
# summarize the param group with most parameters
largest_group = max(len(g["params"]) for g in optimizer.state_dict()["param_groups"])
if largest_group == 1:
# If all groups have one parameter,
# then find the most common initial LR, and use it for summary
lr_count = Counter(
[g["_options"]["lr"] for g in optimizer.state_dict()["param_groups"]]
)
lr = lr_count.most_common()[0][0]
for i, g in enumerate(optimizer.state_dict()["param_groups"]):
if g["_options"]["lr"] == lr:
return i
else:
for i, g in enumerate(optimizer.state_dict()["param_groups"]):
if len(g["params"]) == largest_group:
return i
def after_step(self):
lr = self.scheduler.get_last_lr()[self._best_param_group_id]
self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
self.scheduler.step()
@property
def scheduler(self):
return self._scheduler or self.trainer.lr_scheduler
def state_dict(self):
if isinstance(self.scheduler, flow.optim.lr_scheduler._LRScheduler):
return self.scheduler.state_dict()
return {}
def load_state_dict(self, state_dict):
if isinstance(self.scheduler, flow.optim.lr_scheduler._LRScheduler):
logger.info("Loading scheduler from state_dict ...")
self.scheduler.load_state_dict(state_dict)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
import weakref
from typing import Callable, List, Mapping
import oneflow as flow
from libai.utils import distributed as dist
from libai.utils.events import EventStorage, get_event_storage
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/train_loop.py
# --------------------------------------------------------
class HookBase:
"""
Base class for hooks that can be registered with :class:`TrainerBase`.
Each hook can implement 4 methods. The way they are called is demonstrated
in the following snippet:
::
hook.before_train()
for iter in range(start_iter, max_iter):
hook.before_step()
trainer.run_step()
hook.after_step()
iter += 1
hook.after_train()
Notes:
1. In the hook method, users can use ``self.trainer`` to access more
properties of the context (e.g., model, current iteration, or config
if using :class:`DefaultTrainer`).
2. A hook that does something in :meth:`before_step` can often be
implemented equivalently in :meth:`after_step`.
If the hook takes non-trivial time, it is strongly recommended to
implement the hook in :meth:`after_step` instead of :meth:`before_step`.
The convention is that :meth:`before_step` should only take negligible time.
Following this convention will allow hooks that do care about the difference
between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
function properly.
"""
trainer: "TrainerBase" = None
"""
A weak reference to the trainer object. Set by the trainer when the hook is registered.
"""
def before_train(self):
"""
Called before the first iteration.
"""
def after_train(self):
"""
Called after the last iteration.
"""
def before_step(self):
"""
Called before each iteration.
"""
def after_step(self):
"""
Called after each iteration.
"""
class TrainerBase:
"""
Base class for iterative trainer with hooks.
The only assumption we made here is: the training runs in a loop.
A subclass can implement what the loop is.
We made no assumptions about the existence of dataloader, optimizer, model, etc.
Attributes:
iter(int): The current iteration.
start_iter(int): The iteration to start with.
By convention the minimum possible value is 0.
max_iter(int): The iteration to end training.
storage(EventStorage): An EventStorage that's opened during the course of training.
"""
def __init__(self):
self._hooks: List[HookBase] = []
self.iter: int = 0
self.start_iter: int = 0
self.max_iter: int
self.storage: EventStorage
def register_hooks(self, hooks):
"""
Register hooks to the trainer. The hooks are executed in the order
they are registered.
Args:
hooks (list[Optional[HookBase]]): list of hooks
"""
hooks = [h for h in hooks if h is not None]
for h in hooks:
assert isinstance(h, HookBase)
# To avoid circular reference, hooks and trainer cannot own each other.
# This normally does not matter, but will cause memory leak if the
# involved objects contain __del__:
# See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
h.trainer = weakref.proxy(self)
self._hooks.extend(hooks)
def train(self, start_iter: int, max_iter: int):
"""
Args:
start_iter, max_iter (int): See docs above
"""
logger = logging.getLogger(__name__)
logger.info("Starting training from iteration {}".format(start_iter))
self.iter = self.start_iter = start_iter
self.max_iter = max_iter
with EventStorage(self.start_iter) as self.storage:
try:
self.before_train()
for self.iter in range(start_iter, max_iter):
self.before_step()
self.run_step()
self.after_step()
# self.iter == max_iter can be used by `after_train` to
# tell whether the training successfully finished or failed
# due to exceptions.
self.iter += 1
except Exception:
logger.exception("Exception during training:")
raise
finally:
self.after_train()
def before_train(self):
for h in self._hooks:
h.before_train()
def after_train(self):
for h in self._hooks:
h.after_train()
def before_step(self):
self.storage.iter = self.iter
for h in self._hooks:
h.before_step()
def after_step(self):
self.storage.samples = (self.iter + 1) * self.cfg.train.global_batch_size
for h in self._hooks:
h.after_step()
def run_step(self):
raise NotImplementedError
@staticmethod
def write_metrics(
loss_dict: Mapping[str, flow.Tensor],
data_time: float,
prefix: str = "",
) -> None:
"""
Args:
loss_dict (dict): dict of scalar losses
data_time (float): time taken by the dataloader iteration
prefix (str): prefix for logging keys
"""
# Move the metric values to rank 0, because logger.info only works on rank 0.
metrics_dict = {
k: dist.tensor_to_rank0(v, device="cpu", to_local=True) for k, v in loss_dict.items()
}
metrics_dict["data_time"] = data_time
# TODO: Gather metrics among all workers for logging
# all_metrics_dict = dist.gather(metrics_dict)
all_metrics_dict = metrics_dict
if dist.is_main_process():
storage = get_event_storage()
# data_time among workers can have high variance. The actual latency
# caused by data_time is the maximum among workers.
# data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
data_time = all_metrics_dict.pop("data_time")
storage.put_scalar("data_time", data_time)
# average the rest metrics
# metrics_dict = {
# k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
# }
metrics_dict = all_metrics_dict
total_losses_reduced = sum(v for k, v in metrics_dict.items() if "loss" in k)
storage.put_scalar("{}total_loss".format(prefix), total_losses_reduced)
if len(metrics_dict) > 1:
storage.put_scalars(**metrics_dict)
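# Illustrative only: a minimal, hedged sketch of the TrainerBase contract -- subclass it,
# implement `run_step`, then drive the loop with `train(start_iter, max_iter)`. The no-op
# step and the tiny stand-in `cfg` below (the base `after_step` reads
# `self.cfg.train.global_batch_size`) are assumptions for demonstration only.
class _ExampleNoOpTrainer(TrainerBase):
    def run_step(self):
        # A real trainer would fetch a batch, compute losses and update the model here.
        pass
def _example_trainer_usage():
    from types import SimpleNamespace
    trainer = _ExampleNoOpTrainer()
    trainer.cfg = SimpleNamespace(train=SimpleNamespace(global_batch_size=1))
    trainer.register_hooks([_ExampleIterTimerHook()])
    trainer.train(start_iter=0, max_iter=10)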
class EagerTrainer(TrainerBase):
"""
A simple eager trainer for the most common type of task:
single-cost single-optimizer single-data-source iterative optimization,
optionally using data-parallelism.
It assumes that in every step, you:
1. Compute the loss with a data from the data_loader.
2. Compute the gradients with the above loss.
3. Update the model with the optimizer.
All other tasks during training (checkpointing, logging, evaluation, LR schedule)
are maintained by hooks, which can be registered by :meth:`TrainerBase.register_hooks`.
If you want to do anything fancier than this,
either subclass TrainerBase and implement your own `run_step`,
or write your own training loop.
"""
def __init__(self, model, data_loader, optimizer, grad_acc_steps=1):
"""
Args:
model: a flow.nn.Module. Takes a data from data_loader and returns a
dict of losses.
data_loader: an iterable. Contains data to be used to call model.
optimizer: a flow optimizer.
"""
super().__init__()
# We set the model to training mode in the trainer.
# However it's valid to train a model that's in eval mode.
# If you want your model (or a submodule of it) to behave
# like evaluation during training, you can overwrite its train() method.
model.train()
self.model = model
self.data_loader = data_loader
self._data_loader_iter = iter(data_loader)
self.optimizer = optimizer
self.grad_acc_steps = grad_acc_steps
def run_step(self, get_batch: Callable, input_placement_device: str = "cuda"):
"""
Implement the standard training logic described above.
"""
        assert self.model.training, "[EagerTrainer] model was changed to eval mode!"
start = time.perf_counter()
# If you want to do something with the data, you can wrap the dataloader.
data = next(self._data_loader_iter)
data = get_batch(
data, input_placement_device, getattr(self.data_loader, "mixup_func", None)
)
data_time = time.perf_counter() - start
loss_dict = self.model(**data)
losses = sum(v for k, v in loss_dict.items() if "loss" in k) / self.grad_acc_steps
losses.backward()
self.write_metrics(loss_dict, data_time)
if (self.iter + 1) % self.grad_acc_steps == 0:
self.optimizer.clip_grad()
self.optimizer.step()
self.optimizer.zero_grad()
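# Illustrative only: the gradient-accumulation arithmetic used in `run_step` above, rewritten
# against plain oneflow objects (no LiBai trainer involved) so it can be run in isolation.
# The model, data and learning rate are arbitrary assumptions.
def _grad_accumulation_sketch(grad_acc_steps: int = 4, total_iters: int = 8):
    model = flow.nn.Linear(4, 1)
    optimizer = flow.optim.SGD(model.parameters(), lr=0.1)
    for it in range(total_iters):
        x, target = flow.randn(2, 4), flow.randn(2, 1)
        # Scale each micro-batch loss so the accumulated gradient matches a full batch.
        loss = flow.nn.functional.mse_loss(model(x), target) / grad_acc_steps
        loss.backward()
        if (it + 1) % grad_acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()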
class GraphTrainer(TrainerBase):
"""
A simple graph trainer for training and evaluating models in a static graph mode.
"""
def __init__(self, graph, data_loader, grad_acc_steps=1):
super().__init__()
graph.model.train()
self.data_loader = data_loader
self._data_loader_iter = iter(data_loader)
self.graph = graph
self.grad_acc_steps = grad_acc_steps
self._temp_data = None
self._temp_count = 0
def run_step(self, get_batch: Callable, input_placement_device: str = "cuda"):
"""
Implement the standard training logic described above.
"""
        assert self.graph.model.training, "[GraphTrainer] model was changed to eval mode!"
start = time.perf_counter()
while self._temp_count != self.grad_acc_steps:
# If you want to do something with the data, you can wrap the dataloader.
data = next(self._data_loader_iter)
self._temp_count += 1
if self._temp_data is None:
self._temp_data = data
else:
                # In static graph mode, nn.Graph slices the data automatically.
                # To recover the full mini-batch, concatenate the local tensors first.
for key, value in data.get_fields().items():
temp_value = self._temp_data.get(key)
self._temp_data.get(key).tensor = flow.cat(
(temp_value.tensor, value.tensor), dim=0
)
data = self._temp_data
self._temp_count = 0
self._temp_data = None
data = get_batch(
data, input_placement_device, getattr(self.data_loader, "mixup_func", None)
)
data_time = time.perf_counter() - start
# If you want to do something with the losses, you can wrap the model.
loss_dict = self.graph(**data)
        # When gradient accumulation is enabled, the graph returns an unpacked n-d tensor
        # whose leading dimension equals the accumulation steps, so reduce it back here.
for key, value in loss_dict.items():
if "loss" in key:
loss_dict[key] = value.mean()
else:
# NOTE: only support scalar tensor currently
loss_dict[key] = value.sum()
self.write_metrics(loss_dict, data_time)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .evaluator import DatasetEvaluator, inference_on_dataset
from .utils import print_csv_format, flatten_results_dict
from .cls_evaluator import ClsEvaluator
from .ppl_evaluator import PPLEvaluator
from .reg_evaluator import RegEvaluator
from .bleu_evaluator import BLEUEvaluator
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import OrderedDict
from nltk.translate.bleu_score import corpus_bleu
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
class BLEUEvaluator(DatasetEvaluator):
"""
    Evaluate BLEU (Bilingual Evaluation Understudy) score.
BLEU is a score for comparing a candidate translation
of text to one or more reference translations.
"""
def __init__(self):
super().__init__()
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
candidate = outputs["candidate"]
reference = inputs["reference"]
self._predictions.append({"candidate": candidate, "reference": reference})
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
candidates = []
references = []
for pred in predictions:
candidates.append(pred["candidate"])
references.append(pred["reference"])
bleu_score = corpus_bleu(references, candidates)
self._results = OrderedDict()
self._results["bleu_score"] = bleu_score
return copy.deepcopy(self._results)
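# Illustrative only: a hedged sketch of the input format BLEUEvaluator expects. Each candidate
# is a tokenized hypothesis and each reference entry is a list of tokenized reference
# translations, following nltk's `corpus_bleu` convention.
def _bleu_evaluator_example():
    evaluator = BLEUEvaluator()
    evaluator.reset()
    inputs = {"reference": [["the", "cat", "sat", "on", "the", "mat"]]}
    outputs = {"candidate": ["the", "cat", "sat", "on", "the", "mat"]}
    evaluator.process(inputs, outputs)
    return evaluator.evaluate()  # e.g. OrderedDict([("bleu_score", 1.0)]) on the main process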
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import OrderedDict
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
def accuracy(output, target, topk=(1,)):
maxk = min(max(topk), output.size()[1])
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.reshape(1, -1).expand_as(pred))
return [
(correct[: min(k, maxk)].reshape(-1).float().sum(0) * 100.0 / batch_size).item()
for k in topk
]
class ClsEvaluator(DatasetEvaluator):
"""
Evaluate accuracy for classification.
The metrics range from 0 to 100 (instead of 0 to 1).
    Different top-k accuracies are supported.
    You can set `cfg.train.topk=(1, 5, N)` according to your needs.
"""
def __init__(self, topk=(1, 5)):
self.topk = topk
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
pred_logits = outputs["prediction_scores"]
labels = inputs["labels"]
# measure accuracy
topk_acc = accuracy(pred_logits, labels, topk=self.topk)
num_correct_acc_topk = [acc * labels.size(0) / 100 for acc in topk_acc]
self._predictions.append(
{"num_correct_topk": num_correct_acc_topk, "num_samples": labels.size(0)}
)
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
total_correct_num = OrderedDict()
for top_k in self.topk:
total_correct_num["Acc@" + str(top_k)] = 0
total_samples = 0
for prediction in predictions:
for top_k, num_correct_n in zip(self.topk, prediction["num_correct_topk"]):
total_correct_num["Acc@" + str(top_k)] += int(num_correct_n)
total_samples += int(prediction["num_samples"])
self._results = OrderedDict()
for top_k, topk_correct_num in total_correct_num.items():
self._results[top_k] = topk_correct_num / total_samples * 100
return copy.deepcopy(self._results)
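# Illustrative only: a tiny worked example of the top-k accuracy computed above. With two
# samples, labels (1, 0) and the logits below, only the first top-1 prediction is correct
# (Acc@1 = 50.0), while both labels appear in the top-2 predictions (Acc@2 = 100.0).
def _topk_accuracy_example():
    import oneflow as flow
    logits = flow.tensor([[0.1, 0.8, 0.1], [0.3, 0.5, 0.2]])
    labels = flow.tensor([1, 0])
    return accuracy(logits, labels, topk=(1, 2))  # -> [50.0, 100.0]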
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import logging
import time
from collections import OrderedDict, abc
from contextlib import ExitStack, contextmanager
from typing import Callable, List, Union
import oneflow as flow
from libai.utils import distributed as dist
from libai.utils.logger import log_every_n_seconds
from .utils import pad_batch
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/evaluator.py
# --------------------------------------------------------
class DatasetEvaluator:
"""
Base class for a dataset evaluator.
The function :func:`inference_on_dataset` runs the model over
    all samples in the dataset, and uses a DatasetEvaluator to process the inputs/outputs.
This class will accumulate information of the inputs/outputs (by :meth:`process`),
and produce evaluation results in the end (by :meth:`evaluate`).
"""
def reset(self):
"""
Preparation for a new round of evaluation.
Should be called before starting a round of evaluation.
"""
def process(self, inputs, outputs):
"""
Process the pair of inputs and outputs.
.. code-block:: python
pred_logits = outputs["prediction_scores"]
labels = inputs["labels"]
# do evaluation on pred_logits/labels pair
...
Args:
            inputs (dict): the inputs that are used to call the model.
outputs (dict): the return dict of `model(**inputs)`
"""
def evaluate(self):
"""
Evaluate/summarize the performance after processing all input/output pairs.
Returns:
dict:
A new evaluator class can return a dict of arbitrary format
as long as the user can process the results.
In our train_net.py, we expect the following format:
* key: the name of the task (e.g., Classification)
* value: a dict of {metric name: score}, e.g.: {"Acc@1": 75.0}
"""
class DatasetEvaluators(DatasetEvaluator):
"""
Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
This class dispatches every evaluation call to
all of its :class:`DatasetEvaluator`.
"""
def __init__(self, evaluators):
"""
Args:
evaluators (list): the evaluators to combine.
"""
super().__init__()
self._evaluators = evaluators
def reset(self):
for evaluator in self._evaluators:
evaluator.reset()
def process(self, inputs, outputs):
for evaluator in self._evaluators:
evaluator.process(inputs, outputs)
def evaluate(self):
results = OrderedDict()
for evaluator in self._evaluators:
result = evaluator.evaluate()
if dist.is_main_process() and result is not None:
for k, v in result.items():
assert (
k not in results
), "Different evaluators produce results with the same key {}".format(k)
results[k] = v
return results
def inference_on_dataset(
model,
data_loader,
batch_size,
eval_iter,
get_batch: Callable,
input_placement_device: str,
evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None],
):
"""
Run model on the data_loader and evaluate the metrics with evaluator.
Also benchmark the inference speed of `model.__call__` accurately.
The model will be used in eval mode.
Args:
model (callable): a callable which takes an object from
`data_loader` and returns some outputs.
If it's an nn.Module, it will be temporarily set to `eval` mode.
If you wish to evaluate a model in `training` mode instead, you can
wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        batch_size: the batch size used for inference.
        eval_iter: the number of iterations to run for evaluation.
        get_batch: a callable used to fetch a batch of data from the dataloader.
        input_placement_device: the device used in `get_batch`; set it to `cuda` or `cpu`.
            See input_placement_device in `libai.configs.common.train.py` for more details.
evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
but don't want to do any evaluation.
Returns:
The return value of `evaluator.evaluate()`
"""
num_devices = dist.get_world_size()
logger = logging.getLogger(__name__)
total_samples = len(data_loader.dataset) # inference data loader must have a fixed length
if evaluator is None:
# create a no-op evaluator
evaluator = DatasetEvaluators([])
if isinstance(evaluator, abc.MutableSequence):
evaluator = DatasetEvaluators(evaluator)
evaluator.reset()
num_warmup = min(5, len(data_loader) - 1)
start_time = time.perf_counter()
total_data_time = 0
total_compute_time = 0
total_eval_time = 0
consumed_samples = 0
dps = dist.get_data_parallel_size()
last_batch_lack = (dps - (total_samples % dps)) % dps
# reset total samples
real_eval_iter = min(eval_iter, len(data_loader))
total_samples = min(real_eval_iter * batch_size, len(data_loader.dataset))
logger.info(
f"with eval_iter {eval_iter}, "
f"reset total samples {len(data_loader.dataset)} to {total_samples}"
)
logger.info(f"Start inference on {total_samples} samples")
with ExitStack() as stack:
if isinstance(model, (flow.nn.Module, flow.nn.Graph)):
stack.enter_context(inference_context(model))
stack.enter_context(flow.no_grad())
start_data_time = time.perf_counter()
for idx, inputs in enumerate(data_loader):
if idx >= real_eval_iter:
break
total_data_time += time.perf_counter() - start_data_time
if idx == num_warmup:
start_time = time.perf_counter()
total_data_time = 0
total_compute_time = 0
total_eval_time = 0
start_compute_time = time.perf_counter()
# model forward
data = get_batch(inputs, input_placement_device)
is_last_batch = idx == len(data_loader) - 1
            padded_data, valid_sample = pad_batch(data, batch_size, last_batch_lack, is_last_batch)
            outputs = model(**padded_data)
# get valid sample
valid_data = {
key: dist.tensor_to_rank0(value, to_local=True)[:valid_sample]
for key, value in data.items()
}
valid_outputs = {}
for key, value in outputs.items():
value = dist.tensor_to_rank0(value, to_local=True)
if value.ndim > 1:
valid_outputs[key] = value[:valid_sample] # Slice if it's batched output
else:
valid_outputs[key] = value
if flow.cuda.is_available():
dist.synchronize()
total_compute_time += time.perf_counter() - start_compute_time
start_eval_time = time.perf_counter()
if dist.is_main_process():
evaluator.process(valid_data, valid_outputs)
dist.synchronize()
total_eval_time += time.perf_counter() - start_eval_time
consumed_samples += valid_sample
iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
data_seconds_per_iter = total_data_time / iters_after_start
compute_seconds_per_iter = total_compute_time / iters_after_start
eval_seconds_per_iter = total_eval_time / iters_after_start
total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
eta = datetime.timedelta(
seconds=int(total_seconds_per_iter * (total_samples // batch_size - idx - 1))
)
log_every_n_seconds(
logging.INFO,
(
f"Inference done {consumed_samples}/{total_samples}. "
f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
f"Total: {total_seconds_per_iter:.4f} s/iter. "
f"ETA={eta}"
),
n=5,
)
start_data_time = time.perf_counter()
# Measure the time only for this worker (before the synchronization barrier)
total_time = time.perf_counter() - start_time
total_time_str = str(datetime.timedelta(seconds=total_time))
# NOTE this format is parsed by grep
logger.info("Total valid samples: {}".format(consumed_samples))
logger.info(
"Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
total_time_str, total_time / (total_samples - num_warmup), num_devices
)
)
total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
logger.info(
"Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
total_compute_time_str,
total_compute_time / (total_samples - num_warmup),
num_devices,
)
)
results = evaluator.evaluate()
# An evaluator may return None when not in main process.
# Replace it by an empty dict instead to make it easier for downstream code to handle
if results is None:
results = {}
return results
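# Illustrative only: a hedged sketch of calling `inference_on_dataset`. The model, dataloader
# and `get_batch` arguments are placeholders supplied by the caller (in LiBai they usually come
# from the trainer and its config), not objects defined in this file.
def _inference_on_dataset_example(model, data_loader, get_batch):
    return inference_on_dataset(
        model,
        data_loader,
        batch_size=8,
        eval_iter=100,
        get_batch=get_batch,
        input_placement_device="cuda",
        evaluator=DatasetEvaluators([]),  # no-op evaluator: benchmark only
    )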
@contextmanager
def inference_context(model):
"""
A context where the model is temporarily changed to eval mode,
and restored to previous mode afterwards.
Args:
model: eager or graph mode in oneflow
"""
training_mode = model.model.training if isinstance(model, flow.nn.Graph) else model.training
if isinstance(model, flow.nn.Graph):
model.model.eval()
else:
model.eval()
yield
if isinstance(model, flow.nn.Graph):
model.model.train(training_mode)
else:
model.train(training_mode)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import math
from collections import OrderedDict
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
class PPLEvaluator(DatasetEvaluator):
"""
Evaluate perplexity for Language Model.
Perplexity is a measurement of how well a probability distribution or
probability model predicts a sample.
"""
def __init__(self):
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
for k, v in outputs.items():
ppl = math.exp(min(20, v.item()))
self._predictions.append({f"{k}_PPL": ppl})
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
self._results = OrderedDict()
for prediction in predictions:
for k, v in prediction.items():
if k not in self._results:
self._results[k] = 0
self._results[k] += v
for k in self._results.keys():
self._results[k] /= len(predictions)
return copy.deepcopy(self._results)
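# Illustrative only: a tiny worked example of the perplexity computation above. A language-model
# cross-entropy loss of 2.0 nats corresponds to a perplexity of exp(2.0) ~= 7.389; the exponent
# is clipped at 20 to avoid overflow on degenerate losses.
def _ppl_example(loss_value: float = 2.0) -> float:
    return math.exp(min(20, loss_value))  # ~7.389 for loss 2.0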
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
from collections import OrderedDict
import numpy as np
from scipy.stats import pearsonr, spearmanr
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
logger = logging.getLogger(__name__)
class RegEvaluator(DatasetEvaluator):
def __init__(self):
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
pred_logits = outputs["prediction_scores"]
labels = inputs["labels"]
        # collect predictions and labels for correlation metrics
preds = pred_logits.cpu().topk(1)[1].squeeze(1).numpy()
labels = labels.cpu().numpy()
self._predictions.append({"preds": preds, "labels": labels})
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
preds = np.array([])
labels = np.array([])
for prediction in predictions:
preds = np.concatenate((preds, prediction["preds"]))
labels = np.concatenate((labels, prediction["labels"]))
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
corr = (pearson_corr + spearman_corr) / 2
self._results = OrderedDict()
self._results["pearson"] = pearson_corr
self._results["spearman"] = spearman_corr
self._results["corr"] = corr
return copy.deepcopy(self._results)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from collections.abc import Mapping
import oneflow as flow
from libai.utils import distributed as dist
def pad_batch(x_dict, batch_size, last_batch_lack, is_last_batch):
x = list(x_dict.values())[0]
tensor_batch = x.shape[0]
assert tensor_batch <= batch_size
if tensor_batch == batch_size and not is_last_batch:
return x_dict, batch_size
valid_sample = tensor_batch - last_batch_lack
data_parallel_size = dist.get_data_parallel_size()
assert tensor_batch % data_parallel_size == 0
tensor_micro_batch_size = tensor_batch // data_parallel_size
padded_dict = {}
for key, xi in x_dict.items():
pad_shape = (batch_size, *xi.shape[1:])
local_xi = xi.to_global(
sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")
).to_local()
padded_xi = flow.zeros(pad_shape, dtype=xi.dtype, device="cuda")
padded_xi[:tensor_batch, ...] = padded_xi[:tensor_batch, ...] + local_xi
for i in range(last_batch_lack - 1):
start_idx = tensor_micro_batch_size * (data_parallel_size - i - 1) - 1
padded_xi[start_idx:-1] = padded_xi[start_idx + 1 :]
padded_xi = padded_xi.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=xi.placement
).to_global(sbp=xi.sbp)
padded_dict[key] = padded_xi
return padded_dict, valid_sample
def print_csv_format(results):
"""
Print main metrics in a particular format
so that they are easy to copypaste into a spreadsheet.
Args:
results (OrderedDict[dict]): task_name -> {metric -> score}
unordered dict can also be printed, but in arbitrary order
"""
assert isinstance(results, Mapping) or not len(results), results
logger = logging.getLogger(__name__)
for task, res in results.items():
if isinstance(res, Mapping):
# Don't print "AP-category" metrics since they are usually not tracked.
important_res = [(k, v) for k, v in res.items() if "-" not in k]
logger.info("copypaste: Task: {}".format(task))
logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
else:
logger.info(f"copypaste: {task}={res}")
def flatten_results_dict(results):
"""
Expand a hierarchical dict of scalars into a flat dict of scalars.
If results[k1][k2][k3] = v, the returned dict will have the entry
{"k1/k2/k3": v}.
Args:
results (dict):
"""
r = {}
for k, v in results.items():
if isinstance(v, Mapping):
v = flatten_results_dict(v)
for kk, vv in v.items():
r[k + "/" + kk] = vv
else:
r[k] = v
return r
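# Illustrative only: a small example of the flattening behaviour documented above.
def _flatten_results_dict_example():
    nested = {"task": {"Acc@1": 75.0, "Acc@5": 92.0}, "loss": 0.3}
    return flatten_results_dict(nested)
    # -> {"task/Acc@1": 75.0, "task/Acc@5": 92.0, "loss": 0.3}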
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from abc import ABCMeta, abstractmethod
from typing import Any, Dict
import oneflow as flow
from libai.config import LazyConfig, try_get_key
from libai.engine import DefaultTrainer
from libai.utils import distributed as dist
from libai.utils.logger import setup_logger
logger = setup_logger(distributed_rank=dist.get_rank())
logger = logging.getLogger("libai.inference")
class BasePipeline(metaclass=ABCMeta):
"""
    Base class for all task pipelines.
"""
def __init__(
self,
config_file,
data_parallel=None,
tensor_parallel=None,
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
model_path=None,
mode="libai",
**kwargs,
):
# init cfg
self.cfg = LazyConfig.load(config_file)
flow.boxing.nccl.set_fusion_threshold_mbytes(
try_get_key(self.cfg, "train.nccl_fusion_threshold_mb", default=16)
)
flow.boxing.nccl.set_fusion_max_ops_num(
try_get_key(self.cfg, "train.nccl_fusion_max_ops", default=24)
)
self.update_cfg(
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
)
dist.setup_dist_util(self.cfg.train.dist)
assert (
self.cfg.train.dist.data_parallel_size == 1
), "not support data parallel yet, only support tensor and pipeline parallel"
logger.info(self.cfg.train.dist)
# initial and load model
self.model = self.load_pretrain_weight(self.cfg.model, model_path, mode=mode)
self.model._apply(dist.convert_to_distributed_default_setting)
self.model = self.model.eval()
# initial tokenizer
if dist.is_main_process():
self.tokenizer = self.build_tokenizer(self.cfg)
else:
self.tokenizer = None
self.tokenizer = dist.broadcast_py_object(self.tokenizer, src=0)
# set parameters
(
self._preprocess_params,
self._forward_params,
self._postprocess_params,
) = self._parse_parameters(**kwargs)
def update_cfg(
self,
data_parallel=1,
tensor_parallel=1,
pipeline_parallel=1,
pipeline_stage_id=None,
pipeline_num_layers=None,
):
self.cfg.train.dist.data_parallel_size = data_parallel
self.cfg.train.dist.tensor_parallel_size = tensor_parallel
self.cfg.train.dist.pipeline_parallel_size = pipeline_parallel
self.cfg.train.dist.custom_pipeline_stage_id = pipeline_stage_id
if pipeline_num_layers is not None:
self.cfg.train.dist.pipeline_num_layers = pipeline_num_layers
if self.cfg.train.dist.pipeline_parallel_size > 1:
assert (
try_get_key(self.cfg.train.dist, "pipeline_num_layers") is not None
), "cfg.train.dist.pipeline_num_layers must be set when run pipeline parallel"
def load_pretrain_weight(
self,
libai_cfg_model,
model_path,
mode="libai",
):
"""load pretrained model.
Args:
libai_cfg_model (libai.models): Lazy config Model in Libai, you can import it
by `from libai.config.configs.common.models.bert
import pretrain_model as libai_cfg_model`
model_path (str): The directory path of pretrained model
mode (str): set it to `libai` for loading trained model from libai,
set it to `random` for quickly debugging by random initialized model
"""
if mode == "libai":
from libai.models.utils.model_loader.base_loader import ModelLoaderLiBai
model_loader = ModelLoaderLiBai(libai_cfg_model, libai_cfg_model.cfg, model_path)
model_loader.base_model_prefix_1 = None
model_loader.base_model_prefix_2 = ""
return model_loader.load()
elif mode == "random":
return DefaultTrainer.build_model(self.cfg)
else:
raise NotImplementedError
def build_tokenizer(self, cfg):
tokenizer = None
if try_get_key(cfg, "tokenization") is not None:
tokenizer = DefaultTrainer.build_tokenizer(cfg)
return tokenizer
@abstractmethod
def _parse_parameters(self, **pipeline_parameters):
raise NotImplementedError("_parse_parameters not implemented")
def __call__(self, inputs, *args, batch_size=None, **kwargs) -> dict:
preprocess_params, forward_params, postprocess_params = self._parse_parameters(
**kwargs
) # noqa
# Fuse __init__ params and __call__ params without modifying the __init__ ones.
preprocess_params = {**self._preprocess_params, **preprocess_params}
forward_params = {**self._forward_params, **forward_params}
postprocess_params = {**self._postprocess_params, **postprocess_params}
with flow.no_grad():
model_inputs_dict = self.preprocess(inputs, **preprocess_params)
model_outputs_dict = self.forward(model_inputs_dict, **forward_params)
model_outputs_dict = self.to_local(model_outputs_dict)
if dist.is_main_process():
outputs_dict = self.postprocess(model_outputs_dict, **postprocess_params)
else:
outputs_dict = {}
dist.synchronize()
return outputs_dict
def to_local(self, model_outputs_dict):
for key, value in model_outputs_dict.items():
if isinstance(value, flow.Tensor) and value.is_global:
model_outputs_dict[key] = dist.ttol(
value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]]
)
if flow.cuda.is_available():
dist.synchronize()
return model_outputs_dict
@abstractmethod
def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> dict:
raise NotImplementedError("preprocess not implemented")
@abstractmethod
def forward(self, **kwargs: Dict) -> dict:
raise NotImplementedError("forward not implemented")
@abstractmethod
def postprocess(self, **kwargs: Dict) -> dict:
raise NotImplementedError("postprocess not implemented")
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from abc import ABC, abstractmethod
from collections import UserDict
from typing import Optional, Tuple
import oneflow as flow
from libai.utils import distributed as dist
class BeamScorer(ABC):
@abstractmethod
def process(
self,
input_ids: flow.Tensor,
next_scores: flow.Tensor,
next_tokens: flow.Tensor,
next_indices: flow.Tensor,
**kwargs,
):
raise NotImplementedError("This is an abstract method.")
class BeamHypotheses:
def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool):
"""
Initialize n-best list of hypotheses.
"""
self.length_penalty = length_penalty
self.early_stopping = early_stopping
self.num_beams = num_beams
self.beams = []
self.worst_score = 1e9
def __len__(self) -> int:
"""
Number of hypotheses in the list.
"""
return len(self.beams)
def add(
self, hyp: flow.Tensor, sum_logprobs: float, beam_indices: Optional[flow.Tensor] = None
):
"""
Add a new hypothesis to the list.
"""
score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
if len(self) < self.num_beams or score > self.worst_score:
self.beams.append((score, hyp, beam_indices))
if len(self) > self.num_beams:
sorted_next_scores = sorted([(s, idx) for idx, (s, _, _) in enumerate(self.beams)])
del self.beams[sorted_next_scores[0][1]]
self.worst_score = sorted_next_scores[1][0]
else:
self.worst_score = min(score, self.worst_score)
def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
"""
        If there are enough hypotheses and none of the hypotheses being generated
        can become better than the worst one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
return False
elif self.early_stopping:
return True
else:
cur_score = best_sum_logprobs / cur_len ** self.length_penalty
ret = self.worst_score >= cur_score
return ret
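# Illustrative only: a tiny worked example of the length-penalty score used by
# `BeamHypotheses.add`, i.e. score = sum_logprobs / len(hyp) ** length_penalty.
# With sum_logprobs = -6.0 and a 4-token hypothesis, length_penalty = 1.0 gives
# -6.0 / 4 = -1.5 while length_penalty = 2.0 gives -6.0 / 16 = -0.375, so larger
# penalties favour longer hypotheses.
def _length_penalty_score_example(sum_logprobs=-6.0, hyp_len=4, length_penalty=1.0):
    return sum_logprobs / (hyp_len ** length_penalty)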
class BeamSearchScorer(BeamScorer):
def __init__(
self,
batch_size: int,
num_beams: int,
length_penalty: Optional[float] = 1.0,
do_early_stopping: Optional[bool] = False,
num_beam_hyps_to_keep: Optional[int] = 1,
num_beam_groups: Optional[int] = 1,
**kwargs,
):
self.num_beams = num_beams
self.length_penalty = length_penalty
self.do_early_stopping = do_early_stopping
self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
self.num_beam_groups = num_beam_groups
self.group_size = self.num_beams // self.num_beam_groups
self._is_init = False
self._beam_hyps = [
BeamHypotheses(
num_beams=self.num_beams,
length_penalty=self.length_penalty,
early_stopping=self.do_early_stopping,
)
for _ in range(batch_size)
]
self._done = flow.tensor(
[False for _ in range(batch_size)],
dtype=flow.bool,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
if not isinstance(num_beams, int) or num_beams <= 1:
raise ValueError(
f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}."
"For `num_beams` == 1, one should make use of `greedy_search` instead."
)
if (
not isinstance(num_beam_groups, int)
or (num_beam_groups > num_beams)
or (num_beams % num_beam_groups != 0)
):
raise ValueError(
"`num_beam_groups` has to be an integer smaller or equal than `num_beams` and "
f"`num_beams` has to be divisible by `num_beam_groups`, but is {num_beam_groups}"
f"with `num_beams` being {num_beams}."
)
if "max_length" in kwargs:
warnings.warn(
"Passing `max_length` to BeamSearchScorer is deprecated and has no effect. "
"`max_length` should be passed directly to `beam_search(...)`, `beam_sample(...)`"
", or `group_beam_search(...)`."
)
@property
def is_done(self) -> bool:
return self._done.all()
def process(
self,
input_ids: flow.Tensor,
next_scores: flow.Tensor,
next_tokens: flow.Tensor,
next_indices: flow.Tensor,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
beam_indices: Optional[flow.Tensor] = None,
) -> Tuple[flow.Tensor]:
cur_len = input_ids.shape[-1]
batch_size = len(self._beam_hyps)
if not (batch_size == (input_ids.shape[0] // self.group_size)):
if self.num_beam_groups > 1:
raise ValueError(
f"A group beam size of {input_ids.shape[0]} is used as the input, but a group "
f"beam size of {self.group_size} is expected by the beam scorer."
)
else:
raise ValueError(
f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of "
f"{self.group_size} is expected by the beam scorer."
)
next_beam_scores = flow.zeros(
(batch_size, self.group_size),
dtype=next_scores.dtype,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
next_beam_tokens = flow.zeros(
(batch_size, self.group_size),
dtype=next_tokens.dtype,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
next_beam_indices = flow.zeros(
(batch_size, self.group_size),
dtype=next_indices.dtype,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
for batch_idx, beam_hyp in enumerate(self._beam_hyps):
if self._done[batch_idx]:
if self.num_beams < len(beam_hyp):
raise ValueError(
f"Batch can only be done if at least {self.num_beams} beams have "
"been generated"
)
if eos_token_id is None or pad_token_id is None:
raise ValueError(
"Generated beams >= num_beams -> eos_token_id and pad_token have "
"to be defined"
)
# pad the batch
next_beam_scores[batch_idx, :] = 0
next_beam_tokens[batch_idx, :] = pad_token_id
next_beam_indices[batch_idx, :] = 0
continue
# next tokens for this sentence
beam_idx = 0
for beam_token_rank, (next_token, next_score, next_index) in enumerate(
zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
):
batch_beam_idx = batch_idx * self.group_size + next_index
# add to generated hypotheses if end of sentence
if (eos_token_id is not None) and (next_token.item() == eos_token_id):
# if beam_token does not belong to top num_beams tokens, it should not be added
is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
if is_beam_token_worse_than_top_num_beams:
continue
if beam_indices is not None:
beam_index = beam_indices[batch_beam_idx]
beam_index = beam_index + (next_index,)
else:
beam_index = None
beam_hyp.add(
input_ids[batch_beam_idx].clone(),
next_score.item(),
beam_indices=beam_index,
)
else:
# add next predicted token since it is not eos_token
next_beam_scores[batch_idx, beam_idx] = next_score
next_beam_tokens[batch_idx, beam_idx] = next_token
next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
beam_idx += 1
# once the beam for next step is full, don't add more tokens to it.
if beam_idx == self.group_size:
break
if beam_idx < self.group_size:
raise ValueError(
f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal "
f"to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} "
"are corrected."
)
# Check if we are done so that we can save a pad step if all(done)
self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
next_scores[batch_idx].max().item(), cur_len
)
return UserDict(
{
"next_beam_scores": next_beam_scores.view(-1),
"next_beam_tokens": next_beam_tokens.view(-1),
"next_beam_indices": next_beam_indices.view(-1),
}
)
def finalize(
self,
input_ids: flow.Tensor,
final_beam_scores: flow.Tensor,
final_beam_tokens: flow.Tensor,
final_beam_indices: flow.Tensor,
max_length: int,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
beam_indices: Optional[flow.Tensor] = None,
):
batch_size = len(self._beam_hyps)
# finalize all open beam hypotheses and add to generated hypotheses
for batch_idx, beam_hyp in enumerate(self._beam_hyps):
if self._done[batch_idx]:
continue
# all open beam hypotheses are added to the beam hypothesis
# beam hypothesis class automatically keeps the best beams
for beam_id in range(self.num_beams):
batch_beam_idx = batch_idx * self.num_beams + beam_id
final_score = final_beam_scores[batch_beam_idx].item()
final_tokens = input_ids[batch_beam_idx]
beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
beam_hyp.add(final_tokens, final_score, beam_indices=beam_index)
# select the best hypotheses
sent_lengths = flow.zeros(
batch_size * self.num_beam_hyps_to_keep,
dtype=flow.long,
sbp=input_ids.sbp,
placement=input_ids.placement,
)
best = []
best_indices = []
best_scores = flow.zeros(
batch_size * self.num_beam_hyps_to_keep,
dtype=flow.float32,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
# retrieve best hypotheses
for i, beam_hyp in enumerate(self._beam_hyps):
sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
for j in range(self.num_beam_hyps_to_keep):
best_hyp_tuple = sorted_hyps.pop()
best_score = best_hyp_tuple[0]
best_hyp = best_hyp_tuple[1]
best_index = best_hyp_tuple[2]
sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
# append hyp to lists
best.append(best_hyp)
# append indices to list
best_indices.append(best_index)
best_scores[i * self.num_beam_hyps_to_keep + j] = best_score
# prepare for adding eos
sent_lengths_max = sent_lengths.max().item() + 1
sent_max_len = (
min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max
)
decoded = flow.zeros(
(batch_size * self.num_beam_hyps_to_keep, sent_max_len),
dtype=flow.long,
sbp=input_ids.sbp,
placement=input_ids.placement,
)
if len(best_indices) > 0 and best_indices[0] is not None:
indices = flow.zeros(
(batch_size * self.num_beam_hyps_to_keep, sent_max_len),
dtype=flow.long,
sbp=input_ids.sbp,
placement=input_ids.placement,
)
else:
indices = None
# shorter batches are padded if needed
if sent_lengths.min().item() != sent_lengths.max().item():
assert pad_token_id is not None, "`pad_token_id` has to be defined"
decoded.fill_(pad_token_id)
if indices is not None:
indices.fill_(-1)
# fill with hypotheses and eos_token_id if the latter fits in
for i, (hypo, best_idx) in enumerate(zip(best, best_indices)):
decoded[i, : sent_lengths[i]] = hypo
if indices is not None:
indices[i, : len(best_idx)] = flow.tensor(best_idx)
if sent_lengths[i] < sent_max_len:
decoded[i, sent_lengths[i]] = eos_token_id
return UserDict(
{
"sequences": decoded,
"sequence_scores": best_scores,
"beam_indices": indices,
}
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import math
from typing import Callable, List, Tuple
import oneflow as flow
class LogitsProcessorList(list):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> flow.Tensor:
for processor in self:
function_args = inspect.signature(processor.__call__).parameters
if len(function_args) > 2:
if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
raise ValueError(
f"Make sure that all the required parameters: {list(function_args.keys())} "
"for {processor.__class__} are passed to the logits processor."
)
scores = processor(input_ids, scores, **kwargs)
else:
scores = processor(input_ids, scores)
return scores
class NormalizationLogitsProcessor(object):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
scores = scores.log_softmax(dim=-1)
return scores
class InfNanRemoveLogitsProcessor(object):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
scores[scores != scores] = 0.0
scores[scores == float("inf")] = flow.finfo(scores.dtype).max
return scores
class ForcedEOSTokenLogitsProcessor(object):
def __init__(self, max_length: int, eos_token_id: int):
self.max_length = max_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len == self.max_length - 1:
num_tokens = scores.shape[1]
scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] = -float("inf")
scores[:, self.eos_token_id] = 0
return scores
class ForcedBOSTokenLogitsProcessor(object):
def __init__(self, bos_token_id: int):
self.bos_token_id = bos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len == 1:
num_tokens = scores.shape[1]
scores[:, [i for i in range(num_tokens) if i != self.bos_token_id]] = -float("inf")
scores[:, self.bos_token_id] = 0
return scores
class RepetitionPenaltyLogitsProcessor(object):
def __init__(self, penalty: float):
if not isinstance(penalty, float) or not (penalty > 0):
raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
self.penalty = penalty
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
score = flow.gather(scores, 1, input_ids)
score = flow.where(score < 0, score * self.penalty, score / self.penalty)
scores = flow.scatter(scores, 1, input_ids, score)
return scores
class HammingDiversityLogitsProcessor(object):
def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0):
raise ValueError("`diversity_penalty` should be a float strictly larger than 0.")
self._diversity_penalty = diversity_penalty
if not isinstance(num_beams, int) or num_beams < 2:
raise ValueError("`num_beams` should be an integer strictly larger than 1.")
self._num_beams = num_beams
if not isinstance(num_beam_groups, int) or num_beam_groups < 2:
raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.")
if num_beam_groups > num_beams:
raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.")
self._num_sub_beams = num_beams // num_beam_groups
def __call__(self, input_ids, scores, current_tokens, beam_group_idx) -> flow.Tensor:
scores = scores.numpy()
batch_size = current_tokens.shape[0] // self._num_beams
group_start_idx = beam_group_idx * self._num_sub_beams
group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams)
group_size = group_end_idx - group_start_idx
vocab_size = scores.shape[-1]
if group_start_idx == 0:
return scores
for batch_idx in range(batch_size):
# predicted tokens of last time step of previous groups
previous_group_tokens = current_tokens[
batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx
]
token_frequency = flow.bincount(previous_group_tokens, minlength=vocab_size)
scores[batch_idx * group_size : (batch_idx + 1) * group_size] = (
scores[batch_idx * group_size : (batch_idx + 1) * group_size]
- self._diversity_penalty * token_frequency
)
return scores
def _get_ngrams(ngram_size: int, prev_input_ids: flow.Tensor, num_hypos: int):
generated_ngrams = [{} for _ in range(num_hypos)]
for idx in range(num_hypos):
gen_tokens = prev_input_ids[idx].tolist()
generated_ngram = generated_ngrams[idx]
for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]):
prev_ngram_tuple = tuple(ngram[:-1])
generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [
ngram[-1]
]
return generated_ngrams
def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
start_idx = cur_len + 1 - ngram_size
ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist())
return banned_ngrams.get(ngram_idx, [])
def _calc_banned_ngram_tokens(
ngram_size: int, prev_input_ids: flow.Tensor, num_hypos: int, cur_len: int
):
if cur_len + 1 < ngram_size:
return [[] for _ in range(num_hypos)]
generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos)
banned_tokens = [
_get_generated_ngrams(
generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len
)
for hypo_idx in range(num_hypos)
]
return banned_tokens
class NoRepeatNGramLogitsProcessor(object):
def __init__(self, ngram_size: int):
if not isinstance(ngram_size, int) or ngram_size <= 0:
raise ValueError(
f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}"
)
self.ngram_size = ngram_size
def __call__(self, input_ids, scores) -> flow.Tensor:
num_batch_hypotheses = scores.shape[0]
cur_len = input_ids.shape[-1]
banned_batch_tokens = _calc_banned_ngram_tokens(
self.ngram_size, input_ids, num_batch_hypotheses, cur_len
)
for i, banned_tokens in enumerate(banned_batch_tokens):
scores[i, banned_tokens] = -float("inf")
return scores
class EncoderNoRepeatNGramLogitsProcessor(object):
def __init__(self, encoder_ngram_size: int, encoder_input_ids: flow.Tensor):
if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0:
raise ValueError(
"`encoder_ngram_size` has to be a strictly positive integer, but is "
f"{encoder_ngram_size}"
)
self.ngram_size = encoder_ngram_size
if len(encoder_input_ids.shape) == 1:
encoder_input_ids = encoder_input_ids.unsqueeze(0)
self.batch_size = encoder_input_ids.shape[0]
self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size)
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
# B x num_beams
num_hypos = scores.shape[0]
num_beams = num_hypos // self.batch_size
cur_len = input_ids.shape[-1]
banned_batch_tokens = [
_get_generated_ngrams(
self.generated_ngrams[hypo_idx // num_beams],
input_ids[hypo_idx],
self.ngram_size,
cur_len,
)
for hypo_idx in range(num_hypos)
]
for i, banned_tokens in enumerate(banned_batch_tokens):
scores[i, banned_tokens] = -float("inf")
return scores
class MinLengthLogitsProcessor(object):
def __init__(self, min_length: int, eos_token_id: int):
if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
if not isinstance(eos_token_id, int) or eos_token_id < 0:
raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
self.min_length = min_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len < self.min_length:
scores[:, self.eos_token_id] = -float("inf")
return scores
class PrefixConstrainedLogitsProcessor(object):
def __init__(
self, prefix_allowed_tokens_fn: Callable[[int, flow.Tensor], List[int]], num_beams: int
):
self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
self._num_beams = num_beams
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
mask = flow.full_like(scores, -math.inf)
for batch_id, beam_sent in enumerate(
input_ids.view(-1, self._num_beams, input_ids.shape[-1])
):
for beam_id, sent in enumerate(beam_sent):
mask[
batch_id * self._num_beams + beam_id,
self._prefix_allowed_tokens_fn(batch_id, sent),
] = 0
return scores + mask
class ExponentialDecayLengthPenalty(object):
def __init__(
self, exponential_decay_length_penalty: Tuple, eos_token_id: int, input_ids_seq_length: int
):
self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length
self.regulation_factor = exponential_decay_length_penalty[1]
self.eos_token_id = eos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len > self.regulation_start:
scores[:, self.eos_token_id] = scores[:, self.eos_token_id] * pow(
self.regulation_factor, cur_len - self.regulation_start
)
return scores
class TemperatureLogitsWarper(object):
def __init__(self, temperature: float):
if not isinstance(temperature, float) or not (temperature > 0):
raise ValueError(
f"`temperature` has to be a strictly positive float, but is {temperature}"
)
self.temperature = temperature
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
scores = scores / self.temperature
return scores
class TopPLogitsWarper(object):
def __init__(
self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1
):
top_p = float(top_p)
if top_p < 0 or top_p > 1.0:
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
self.top_p = top_p
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
sorted_logits, sorted_indices = flow.sort(scores, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
        # Remove tokens with cumulative probability above the top_p threshold (tokens with 0 are kept)
sorted_indices_to_remove = cumulative_probs > self.top_p
if self.min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1
# because we add the first one below)
sorted_indices_to_remove[..., : self.min_tokens_to_keep - 1] = 0
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# scatter sorted tensors to original indexing
indices_to_remove = flow.scatter(
sorted_indices_to_remove, 1, sorted_indices, sorted_indices_to_remove
)
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
class TopKLogitsWarper(object):
def __init__(
self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1
):
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
self.top_k = top_k
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
top_k = min(max(self.top_k, self.min_tokens_to_keep), scores.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = scores < flow.topk(scores, top_k)[0][..., -1, None]
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
class TypicalLogitsWarper(object):
def __init__(
self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1
):
mass = float(mass)
if not (mass > 0 and mass < 1):
raise ValueError(f"`typical_p` has to be a float > 0 and < 1, but is {mass}")
self.filter_value = filter_value
self.mass = mass
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
# calculate entropy
normalized = flow.nn.functional.log_softmax(scores, dim=-1)
p = flow.exp(normalized)
ent = -flow.nansum(normalized * p, dim=-1, keepdim=True)
# shift and sort
shifted_scores = flow.abs((-normalized) - ent)
sorted_scores, sorted_indices = flow.sort(shifted_scores, descending=False)
sorted_logits = scores.gather(-1, sorted_indices)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
# Remove tokens with cumulative mass above the threshold
last_ind = (cumulative_probs < self.mass).sum(dim=1)
last_ind[last_ind < 0] = 0
sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
if self.min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep
# (set to min_tokens_to_keep-1 because we add the first one below)
sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
indices_to_remove = flow.scatter(
sorted_indices_to_remove, 1, sorted_indices, sorted_indices_to_remove
)
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
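# Illustrative only: a hedged sketch of chaining several of the processors above with
# `LogitsProcessorList`. The vocabulary size, logits and previously generated token ids
# below are arbitrary assumptions.
def _logits_processor_chain_example():
    processors = LogitsProcessorList(
        [
            RepetitionPenaltyLogitsProcessor(penalty=1.2),
            TemperatureLogitsWarper(temperature=0.7),
            TopKLogitsWarper(top_k=2),
        ]
    )
    input_ids = flow.tensor([[3, 5, 5]])  # previously generated tokens
    scores = flow.randn(1, 8)  # next-token logits over an 8-word toy vocabulary
    return processors(input_ids, scores)  # penalized, temperature-scaled, top-k filtered logits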
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import warnings
from copy import deepcopy
import oneflow as flow
class StoppingCriteriaList(list):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> bool:
return any(criteria(input_ids, scores) for criteria in self)
@property
def max_length(self):
for stopping_criterium in self:
if isinstance(stopping_criterium, MaxLengthCriteria):
return stopping_criterium.max_length
return None
class MaxLengthCriteria(object):
def __init__(self, max_length: int):
self.max_length = max_length
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> bool:
return input_ids.shape[-1] >= self.max_length
class MaxTimeCriteria(object):
def __init__(self, max_time: float, initial_timestamp: float = None):
self.max_time = max_time
self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> bool:
return time.time() - self.initial_timestamp > self.max_time
def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int):
stopping_max_length = stopping_criteria.max_length
new_stopping_criteria = deepcopy(stopping_criteria)
if stopping_max_length is not None and stopping_max_length != max_length:
warnings.warn(
"You set different `max_length` for stopping criteria and `max_length` parameter",
UserWarning,
)
elif stopping_max_length is None:
new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length))
return new_stopping_criteria
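# Minimal usage sketch: criteria compose through `StoppingCriteriaList`, and
# `validate_stopping_criteria` appends a `MaxLengthCriteria` when none is
# present. The dummy tensors below are illustrative assumptions only.
if __name__ == "__main__":
    criteria = StoppingCriteriaList([MaxTimeCriteria(max_time=10.0)])
    criteria = validate_stopping_criteria(criteria, max_length=20)
    dummy_input_ids = flow.zeros((1, 25), dtype=flow.long)
    dummy_scores = flow.zeros((1, 100))
    # True: the sequence length (25) already exceeds the max_length criterion (20).
    print(criteria(dummy_input_ids, dummy_scores))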
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import logging
import warnings
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
from .generation_beam_search import BeamScorer, BeamSearchScorer
from .generation_logits_processor import (
EncoderNoRepeatNGramLogitsProcessor,
ExponentialDecayLengthPenalty,
ForcedBOSTokenLogitsProcessor,
ForcedEOSTokenLogitsProcessor,
HammingDiversityLogitsProcessor,
InfNanRemoveLogitsProcessor,
LogitsProcessorList,
MinLengthLogitsProcessor,
NoRepeatNGramLogitsProcessor,
NormalizationLogitsProcessor,
PrefixConstrainedLogitsProcessor,
RepetitionPenaltyLogitsProcessor,
TemperatureLogitsWarper,
TopKLogitsWarper,
TopPLogitsWarper,
TypicalLogitsWarper,
)
from .generation_stopping_criteria import (
MaxLengthCriteria,
MaxTimeCriteria,
StoppingCriteriaList,
validate_stopping_criteria,
)
logger = logging.getLogger(__name__)
class Generator:
def _prepare_model_inputs(
self,
inputs: Optional[flow.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, flow.Tensor]] = None,
):
if self.cfg.is_encoder_decoder:
input_name = "encoder_input_ids"
else:
input_name = "input_ids"
model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name}
inputs_kwarg = model_kwargs.pop(input_name, None)
if inputs_kwarg is not None and inputs is not None:
raise ValueError(
f"`inputs`: {inputs}` were passed alongside "
f"{input_name} which is not allowed."
f"Make sure to either pass {inputs} or {input_name}=..."
)
elif inputs_kwarg is not None:
inputs = inputs_kwarg
if inputs is None:
inputs = self._prepare_input_ids_for_generation(
bos_token_id, model_kwargs.get("encoder_outputs", None)
)
return inputs, input_name, model_kwargs
def prepare_inputs_for_generation(self, input_ids: flow.Tensor, **kwargs):
"""
Implement in subclasses of [`PreTrainedModel`] for custom behavior to prepare inputs in the
generate method.
"""
return {"input_ids": input_ids}
def _prepare_input_ids_for_generation(
self, bos_token_id: Optional[int], encoder_outputs: Optional[flow.Tensor]
):
if self.cfg.is_encoder_decoder and encoder_outputs is not None:
shape = encoder_outputs.size()[:-1]
return (
flow.ones(
shape,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
* -100
)
if bos_token_id is None:
raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
return (
flow.ones(
(1, 1),
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
* bos_token_id
)
def _prepare_attention_mask_for_generation(
self,
inputs: flow.Tensor,
pad_token_id: Optional[int],
eos_token_id: Optional[int],
):
is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [flow.int64, flow.long]
is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (
(eos_token_id is not None) and (pad_token_id != eos_token_id)
)
# Check if input is input_ids and padded -> only then is attention_mask defined
if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
return inputs.ne(pad_token_id).bool()
else:
return flow.ones(
inputs.shape[:2],
dtype=flow.bool,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
def _prepare_encoder_decoder_kwargs_for_generation(
self, inputs_tensor: flow.Tensor, model_kwargs, model_input_name: str
):
only_encoder = True
model_kwargs[model_input_name] = inputs_tensor
if "encoder_decoder_attn_mask" in set(inspect.signature(self.forward).parameters):
model_kwargs["encoder_decoder_attn_mask"] = model_kwargs["encoder_attn_mask"]
model_kwargs["encoder_outputs"] = self(**model_kwargs, only_encoder=only_encoder)
model_kwargs.pop(model_input_name)
return model_kwargs
def _prepare_decoder_input_ids_for_generation(
self,
batch_size: int,
decoder_start_token_id: int = None,
bos_token_id: int = None,
model_kwargs=None,
):
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
return model_kwargs.pop("decoder_input_ids")
else:
decoder_start_token_id = (
decoder_start_token_id
if decoder_start_token_id
else self.cfg.decoder_start_token_id
)
return (
flow.ones(
(batch_size, 1),
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
* decoder_start_token_id
)
def _get_decoder_start_token_id(
self, decoder_start_token_id: int = None, bos_token_id: int = None
):
if decoder_start_token_id is not None:
return decoder_start_token_id
elif self.cfg.is_encoder_decoder:
return self.cfg.decoder_start_token_id
elif bos_token_id is not None:
return bos_token_id
else:
return self.cfg.bos_token_idx
@staticmethod
def _expand_inputs_for_generation(
input_ids: flow.Tensor,
expand_size: int = 1,
is_encoder_decoder: bool = False,
attention_mask: Optional[flow.Tensor] = None,
encoder_outputs: Optional[flow.Tensor] = None,
**model_kwargs,
):
expanded_return_idx = (
flow.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1)
)
expanded_return_idx = expanded_return_idx.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
input_ids = input_ids.index_select(0, expanded_return_idx)
# token_type ids not supported.
if attention_mask is not None:
model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
if is_encoder_decoder:
if encoder_outputs is None:
raise ValueError(
"If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
)
encoder_outputs = encoder_outputs.to_global(placement=expanded_return_idx.placement)
encoder_outputs = encoder_outputs.index_select(0, expanded_return_idx)
model_kwargs["encoder_outputs"] = encoder_outputs
model_kwargs["encoder_attn_mask"] = model_kwargs["encoder_attn_mask"].index_select(
0, expanded_return_idx
)
model_kwargs["encoder_decoder_attn_mask"] = model_kwargs["encoder_attn_mask"]
return input_ids, model_kwargs
def _update_model_kwargs_for_generation(
self, outputs, model_kwargs, is_encoder_decoder: bool = False
):
if "past_key_values" in outputs:
model_kwargs["past"] = outputs["past_key_values"]
elif "mems" in outputs:
model_kwargs["past"] = outputs["mems"]
elif "past_buckets_states" in outputs:
model_kwargs["past"] = outputs["past_buckets_states"]
elif self.past_key_values[-1] is not None:
model_kwargs["past"] = self.past_key_values
else:
model_kwargs["past"] = None
# update attention mask
if "attention_mask" in model_kwargs and not is_encoder_decoder:
attention_mask = model_kwargs["attention_mask"]
pad = flow.ones(
(attention_mask.shape[0], 1),
sbp=attention_mask.sbp,
placement=attention_mask.placement,
)
model_kwargs["attention_mask"] = flow.cat([attention_mask, pad], dim=-1)
if "decoder_attn_mask" in model_kwargs and is_encoder_decoder:
attention_mask = model_kwargs["decoder_attn_mask"]
pad = flow.ones(
(attention_mask.shape[0], 1),
sbp=attention_mask.sbp,
placement=attention_mask.placement,
)
model_kwargs["decoder_attn_mask"] = flow.cat([attention_mask, pad], dim=-1)
return model_kwargs
def _reorder_cache(self, past, beam_idx):
raise NotImplementedError(
"Make sure that a `_reorder_cache` function is correctly implemented in "
f"{self.__class__.__module__} to enable beam search for {self.__class__}"
)
def _get_logits_warper(
self,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
typical_p: Optional[float] = None,
temperature: Optional[float] = None,
num_beams: Optional[int] = None,
renormalize_logits: Optional[bool] = None,
):
# instantiate warpers list
warpers = LogitsProcessorList()
# all samplers can be found in `generation_utils_samplers.py`
if temperature is not None and temperature != 1.0:
warpers.append(TemperatureLogitsWarper(temperature))
if top_k is not None and top_k != 0:
warpers.append(
TopKLogitsWarper(top_k=top_k, min_tokens_to_keep=(2 if num_beams > 1 else 1))
)
if top_p is not None and top_p < 1.0:
warpers.append(
TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=(2 if num_beams > 1 else 1))
)
if typical_p is not None and typical_p < 1.0:
warpers.append(
TypicalLogitsWarper(mass=typical_p, min_tokens_to_keep=(2 if num_beams > 1 else 1))
)
# `LogitNormalization` should always be the last logit processor, when present
if renormalize_logits is True:
warpers.append(NormalizationLogitsProcessor())
return warpers
def _get_logits_processor(
self,
repetition_penalty: float,
no_repeat_ngram_size: int,
encoder_no_repeat_ngram_size: int,
input_ids_seq_length: int,
encoder_input_ids: flow.Tensor,
min_length: int,
max_length: int,
eos_token_id: int,
forced_bos_token_id: int,
forced_eos_token_id: int,
prefix_allowed_tokens_fn: Callable[[int, flow.Tensor], List[int]],
num_beams: int,
num_beam_groups: int,
diversity_penalty: float,
remove_invalid_values: bool,
exponential_decay_length_penalty: Tuple,
logits_processor: Optional[LogitsProcessorList],
renormalize_logits: Optional[bool],
):
"""
This class returns a [`LogitsProcessorList`] list object that contains all relevant
[`LogitsProcessor`] instances used to modify the scores of the language model head.
"""
processors = LogitsProcessorList()
# instantiate processors list
if diversity_penalty is not None and diversity_penalty > 0.0:
processors.append(
HammingDiversityLogitsProcessor(
diversity_penalty=diversity_penalty,
num_beams=num_beams,
num_beam_groups=num_beam_groups,
)
)
if repetition_penalty is not None and repetition_penalty != 1.0:
processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0:
processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size))
if encoder_no_repeat_ngram_size is not None and encoder_no_repeat_ngram_size > 0:
if self.cfg.is_encoder_decoder:
processors.append(
EncoderNoRepeatNGramLogitsProcessor(
encoder_no_repeat_ngram_size, encoder_input_ids
)
)
else:
raise ValueError(
"It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only "
"architecture"
)
if min_length is not None and eos_token_id is not None and min_length > 0:
processors.append(MinLengthLogitsProcessor(min_length, eos_token_id))
if prefix_allowed_tokens_fn is not None:
processors.append(
PrefixConstrainedLogitsProcessor(
prefix_allowed_tokens_fn, num_beams // num_beam_groups
)
)
if forced_bos_token_id is not None:
processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id))
if forced_eos_token_id is not None:
processors.append(ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id))
if remove_invalid_values is True:
processors.append(InfNanRemoveLogitsProcessor())
if exponential_decay_length_penalty is not None:
processors.append(
ExponentialDecayLengthPenalty(
exponential_decay_length_penalty, eos_token_id, input_ids_seq_length
)
)
processors = self._merge_criteria_processor_list(processors, logits_processor)
# `LogitNormalization` should always be the last logit processor, when present
if renormalize_logits is True:
processors.append(NormalizationLogitsProcessor())
return processors
def _get_stopping_criteria(
self,
max_length: Optional[int],
max_time: Optional[float],
stopping_criteria: Optional[StoppingCriteriaList],
):
criteria = StoppingCriteriaList()
if max_length is not None:
criteria.append(MaxLengthCriteria(max_length=max_length))
if max_time is not None:
criteria.append(MaxTimeCriteria(max_time=max_time))
criteria = self._merge_criteria_processor_list(criteria, stopping_criteria)
return criteria
def _merge_criteria_processor_list(self, default_list, custom_list):
if len(custom_list) == 0:
return default_list
for default in default_list:
for custom in custom_list:
if type(custom) is type(default):
raise ValueError("Criteria repetition error.")
default_list.extend(custom_list)
return default_list
def compute_transition_beam_scores(
self,
sequences: flow.Tensor,
scores: Tuple[flow.Tensor],
beam_indices: flow.Tensor,
eos_token_id: int = None,
):
scores = flow.stack(scores).reshape(len(scores), -1).transpose(0, 1)
beam_indices_mask = beam_indices < 0
max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
beam_indices = beam_indices[:, :max_beam_length]
beam_indices_mask = beam_indices_mask[:, :max_beam_length]
beam_indices[beam_indices_mask] = 0
beam_sequence_indices = beam_indices * self.cfg.vocab_size
cut_idx = sequences.shape[-1] - max_beam_length
indices = sequences[:, cut_idx:] + beam_sequence_indices
transition_scores = scores.gather(0, indices)
transition_scores[beam_indices_mask] = 0
return transition_scores
def _validate_model_kwargs(self, model_kwargs):
if self.cfg.is_encoder_decoder:
for key in ["decoder_input_ids"]:
model_kwargs.pop(key, None)
unused_model_args = []
model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
if "kwargs" in model_args:
model_args |= set(inspect.signature(self.forward).parameters)
for key, value in model_kwargs.items():
if value is not None and key not in model_args:
unused_model_args.append(key)
if unused_model_args:
raise ValueError(
f"The following `model_kwargs` are not used by the model: {unused_model_args} "
"(note: typos in the generate arguments will also show up in this list)"
)
def greedy_search(
self,
input_ids: flow.Tensor,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
max_length: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
is_encoder_decoder: bool = False,
output_scores: bool = False,
**model_kwargs,
):
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
scores = () if output_scores else None
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
)
if max_length is not None:
warnings.warn(
"`max_length` is deprecated in this function, use MaxLengthCriteria" " instead.",
UserWarning,
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
# keep track of which sequences are already finished
unfinished_sequences = flow.ones(input_ids.shape[0])
cur_len = input_ids.shape[-1]
while True:
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# generate
outputs = self(**model_inputs)
next_token_logits = outputs["logits"][:, -1, :]
# logits_processor
next_token_scores = logits_processor(input_ids, next_token_logits)
# Store scores
if output_scores:
scores += (next_token_scores,)
# argmax
next_tokens = flow.argmax(next_token_scores, dim=-1)
next_tokens = next_tokens.to_global(placement=input_ids.placement)
unfinished_sequences = unfinished_sequences.to_global(
sbp=next_tokens.sbp, placement=next_tokens.placement
)
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
1 - unfinished_sequences
)
next_tokens = next_tokens.to(flow.long)
input_ids = flow.cat([input_ids, next_tokens[:, None]], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
)
cur_len = cur_len + 1
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id is not None:
unfinished_sequences = flow.mul(
unfinished_sequences, (next_tokens != eos_token_id).long()
)
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
break
# Release records
if "past_key_values" in self.__dir__():
self.past_key_values = [None] * self.cfg.hidden_layers
if "encoder_states" in self.__dir__():
self.encoder_states = None
return input_ids
def multinomial_sample(
self,
input_ids: flow.Tensor,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_warper: Optional[LogitsProcessorList] = None,
max_length: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
is_encoder_decoder: bool = False,
output_scores: bool = False,
**model_kwargs,
):
# init values
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
scores = () if output_scores else None
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
)
if max_length is not None:
warnings.warn(
"`max_length` is deprecated in this function, use "
"`stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`"
"instead.",
UserWarning,
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
unfinished_sequences = flow.ones(input_ids.shape[0])
cur_len = input_ids.shape[-1]
while True:
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# generate
outputs = self(**model_inputs)
next_token_logits = outputs["logits"][:, -1, :]
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
next_token_scores = logits_warper(input_ids, next_token_scores)
# Store scores
if output_scores:
scores += (next_token_scores,)
# sample
probs = nn.functional.softmax(next_token_scores, dim=-1)
probs = probs.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
).to_local()
next_tokens = flow.multinomial(probs, num_samples=1).squeeze(1)
next_tokens = next_tokens.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
unfinished_sequences = unfinished_sequences.to_global(
sbp=next_tokens.sbp, placement=next_tokens.placement
)
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
1 - unfinished_sequences
)
next_tokens = next_tokens.to(flow.long)
input_ids = flow.cat([input_ids, next_tokens[:, None]], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
)
cur_len = cur_len + 1
if eos_token_id is not None:
unfinished_sequences = flow.mul(
unfinished_sequences, (next_tokens != eos_token_id).long()
)
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
break
# Release records
if "past_key_values" in self.__dir__():
self.past_key_values = [None] * self.cfg.hidden_layers
if "encoder_states" in self.__dir__():
self.encoder_states = None
return input_ids
def beam_search(
self,
input_ids: flow.Tensor,
beam_scorer: BeamScorer,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
max_length: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
is_encoder_decoder: bool = False,
output_scores: bool = False,
**model_kwargs,
):
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
scores = () if output_scores else None
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
)
if max_length is not None:
warnings.warn(
"`max_length` is deprecated in this function, use "
"`stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`"
"instead.",
UserWarning,
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
if len(stopping_criteria) == 0:
warnings.warn(
"You don't have defined any stopping_criteria, this will likely loop forever",
UserWarning,
)
batch_size = len(beam_scorer._beam_hyps)
num_beams = beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
if num_beams * batch_size != batch_beam_size:
raise ValueError(
f"Batch dimension of `input_ids` should be {num_beams * batch_size}, "
f"but is {batch_beam_size}."
)
beam_indices = None
beam_scores = flow.zeros(
(batch_size, num_beams),
dtype=flow.float,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
beam_scores[:, 1:] = -1e9
beam_scores = beam_scores.view((batch_size * num_beams,))
while True:
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
outputs = self(**model_inputs)
next_token_logits = outputs["logits"][:, -1, :]
next_token_scores = nn.functional.log_softmax(
next_token_logits, dim=-1
) # (batch_size * num_beams, vocab_size)
next_token_scores = next_token_scores.to_global(
sbp=input_ids.sbp, placement=input_ids.placement
)
next_token_scores_processed = logits_processor(input_ids, next_token_scores)
next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
next_token_scores
)
# Store scores
if output_scores:
scores += (next_token_scores,)
# reshape for beam search
vocab_size = next_token_scores.shape[-1]
next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
next_token_scores, next_tokens = flow.topk(
next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True
)
next_indices = next_tokens // vocab_size
next_tokens = next_tokens % vocab_size
beam_outputs = beam_scorer.process(
input_ids,
next_token_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
beam_indices=beam_indices,
)
beam_scores = beam_outputs["next_beam_scores"]
beam_next_tokens = beam_outputs["next_beam_tokens"]
beam_idx = beam_outputs["next_beam_indices"]
input_ids = flow.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
)
# update past_key_value
if model_kwargs["past"] is not None:
model_kwargs["past"] = self._reorder_cache(beam_idx)
# increase cur_len
cur_len = cur_len + 1
if beam_scorer.is_done or stopping_criteria(input_ids, scores):
break
sequence_outputs = beam_scorer.finalize(
input_ids,
beam_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
max_length=stopping_criteria.max_length,
beam_indices=beam_indices,
)
# Release records
if "past_key_values" in self.__dir__():
self.past_key_values = [None] * self.cfg.hidden_layers
if "encoder_states" in self.__dir__():
self.encoder_states = None
return sequence_outputs["sequences"]
@flow.no_grad()
def generate(
self,
inputs: Optional[flow.Tensor] = None,
max_length: Optional[int] = None,
min_length: Optional[int] = None,
do_sample: Optional[bool] = None,
early_stopping: Optional[bool] = None,
num_beams: Optional[int] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
typical_p: Optional[float] = None,
repetition_penalty: Optional[float] = None,
force_words_ids: Optional[Union[Iterable[int], Iterable[Iterable[int]]]] = None,
bos_token_id: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
length_penalty: Optional[float] = None,
no_repeat_ngram_size: Optional[int] = None,
encoder_no_repeat_ngram_size: Optional[int] = None,
num_return_sequences: Optional[int] = None,
max_time: Optional[float] = None,
max_new_tokens: Optional[int] = None,
decoder_start_token_id: Optional[int] = None,
use_cache: Optional[bool] = None,
num_beam_groups: Optional[int] = None,
diversity_penalty: Optional[float] = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, flow.Tensor], List[int]]] = None,
logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
renormalize_logits: Optional[bool] = None,
stopping_criteria=StoppingCriteriaList(),
constraints=None,
output_scores: Optional[bool] = None,
forced_bos_token_id: Optional[int] = None,
forced_eos_token_id: Optional[int] = None,
remove_invalid_values: Optional[bool] = None,
exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None,
**model_kwargs,
):
# 0. Validate model kwargs
self._validate_model_kwargs(model_kwargs.copy())
# 1. Set generation parameters if not already defined
bos_token_id = bos_token_id if bos_token_id is not None else self.cfg.bos_token_id
num_beams = num_beams if num_beams is not None else self.cfg.num_beams
length_penalty = length_penalty if length_penalty is not None else self.cfg.length_penalty
early_stopping = early_stopping if early_stopping is not None else self.cfg.early_stopping
num_beam_groups = (
num_beam_groups if num_beam_groups is not None else self.cfg.num_beam_groups
)
do_sample = do_sample if do_sample is not None else self.cfg.do_sample
num_return_sequences = (
num_return_sequences
if num_return_sequences is not None
else self.cfg.num_return_sequences
)
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
# 2. Prepare model inputs
inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
inputs, bos_token_id, model_kwargs
)
batch_size = inputs_tensor.shape[0]
# 3. Prepare other model kwargs
model_kwargs["use_cache"] = use_cache if use_cache is not None else self.cfg.use_cache
if self.cfg.is_encoder_decoder:
att_mask_name = "encoder_attn_mask"
accepts_attention_mask = att_mask_name in set(
inspect.signature(self.forward).parameters.keys()
)
else:
att_mask_name = "attention_mask"
accepts_attention_mask = att_mask_name in set(
inspect.signature(self.forward).parameters.keys()
)
requires_attention_mask = "encoder_outputs" not in model_kwargs
if (
model_kwargs.get(att_mask_name, None) is None
and requires_attention_mask
and accepts_attention_mask
):
model_kwargs[att_mask_name] = self._prepare_attention_mask_for_generation(
inputs_tensor, pad_token_id, eos_token_id
)
if self.cfg.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
# if model is encoder decoder encoder_outputs are created
# and added to `model_kwargs`
model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
inputs_tensor, model_kwargs, model_input_name
)
# 4. Prepare `input_ids` which will be used for auto-regressive generation
if self.cfg.is_encoder_decoder:
input_ids = self._prepare_decoder_input_ids_for_generation(
batch_size,
decoder_start_token_id=decoder_start_token_id,
bos_token_id=bos_token_id,
model_kwargs=model_kwargs,
)
else:
# if decoder-only then inputs_tensor has to be `input_ids`
input_ids = inputs_tensor
# 5. Prepare `max_length` depending on other stopping criteria.
input_ids_seq_length = input_ids.shape[-1]
if max_length is None and max_new_tokens is None:
if dist.is_main_process():
warnings.warn(
"Neither `max_length` nor `max_new_tokens` has been set, `max_length` will "
f"default to {self.cfg.max_length} (`self.cfg.max_length`). we recommend using"
" `max_new_tokens` to control the maximum length of the generation.",
UserWarning,
)
elif max_length is None and max_new_tokens is not None:
max_length = max_new_tokens + input_ids_seq_length
elif max_length is not None and max_new_tokens is not None:
raise ValueError(
"Both `max_new_tokens` and `max_length` have been set but they serve the same"
)
# default to cfg if still None
max_length = max_length if max_length is not None else self.cfg.max_length
min_length = min_length if min_length is not None else self.cfg.min_length
if min_length is not None and min_length > max_length:
raise ValueError(
f"Unfeasable length constraints: the minimum length ({min_length}) is larger than"
f"the maximum length ({max_length})"
)
if input_ids_seq_length >= max_length:
input_ids_string = "decoder_input_ids" if self.cfg.is_encoder_decoder else "input_ids"
logger.warning(
f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is"
f" set to {max_length}. This can lead to unexpected behavior. You should consider "
"increasing `max_new_tokens`."
)
# 6. Determine generation mode
is_constraint_gen_mode = constraints is not None or force_words_ids is not None
is_greedy_gen_mode = (
(num_beams == 1)
and (num_beam_groups == 1)
and do_sample is False
and not is_constraint_gen_mode
)
is_sample_gen_mode = (
(num_beams == 1)
and (num_beam_groups == 1)
and do_sample is True
and not is_constraint_gen_mode
)
is_beam_gen_mode = (
(num_beams > 1)
and (num_beam_groups == 1)
and do_sample is False
and not is_constraint_gen_mode
)
# is_beam_sample_gen_mode = (
# (num_beams > 1)
# and (num_beam_groups == 1)
# and do_sample is True
# and not is_constraint_gen_mode
# )
is_group_beam_gen_mode = (
(num_beams > 1) and (num_beam_groups > 1) and not is_constraint_gen_mode
)
if num_beam_groups > num_beams:
raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`")
if is_group_beam_gen_mode and do_sample is True:
raise ValueError(
"Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is"
" set to `False`."
)
# 7. Prepare distribution pre_processing samplers
logits_processor = self._get_logits_processor(
repetition_penalty=repetition_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
min_length=min_length,
max_length=max_length,
eos_token_id=eos_token_id,
forced_bos_token_id=forced_bos_token_id,
forced_eos_token_id=forced_eos_token_id,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
num_beams=num_beams,
num_beam_groups=num_beam_groups,
diversity_penalty=diversity_penalty,
remove_invalid_values=remove_invalid_values,
exponential_decay_length_penalty=exponential_decay_length_penalty,
logits_processor=logits_processor,
renormalize_logits=renormalize_logits,
)
# 8. Prepare stopping criteria
stopping_criteria = self._get_stopping_criteria(
max_length=max_length, max_time=max_time, stopping_criteria=stopping_criteria
)
# 9. Go into different generation modes
if is_greedy_gen_mode:
if num_return_sequences > 1:
raise ValueError(
f"num_return_sequences has to be 1, but is {num_return_sequences} when doing"
" greedy search."
)
# 10. Run greedy search
return self.greedy_search(
input_ids,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
output_scores=output_scores,
**model_kwargs,
)
elif is_sample_gen_mode:
# 10. Prepare logits warper
logits_warper = self._get_logits_warper(
top_k=top_k,
top_p=top_p,
typical_p=typical_p,
temperature=temperature,
num_beams=num_beams,
renormalize_logits=renormalize_logits,
)
# 11. Expand input_ids with `num_return_sequences` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids,
expand_size=num_return_sequences,
is_encoder_decoder=self.cfg.is_encoder_decoder,
**model_kwargs,
)
# 12. Run multinomial sample
return self.multinomial_sample(
input_ids,
logits_processor=logits_processor,
logits_warper=logits_warper,
stopping_criteria=stopping_criteria,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
output_scores=output_scores,
**model_kwargs,
)
elif is_beam_gen_mode:
if num_return_sequences > num_beams:
raise ValueError(
"`num_return_sequences` has to be smaller or equal to `num_beams`."
)
if stopping_criteria.max_length is None:
raise ValueError("`max_length` needs to be a stopping_criteria for now.")
# 10. Prepare beam search scorer
beam_scorer = BeamSearchScorer(
batch_size=batch_size,
num_beams=num_beams,
length_penalty=length_penalty,
do_early_stopping=early_stopping,
num_beam_hyps_to_keep=num_return_sequences,
)
# 11. Interleave input_ids with `num_beams` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids,
expand_size=num_beams,
is_encoder_decoder=self.cfg.is_encoder_decoder,
**model_kwargs,
)
# 12. Run beam search
return self.beam_search(
input_ids,
beam_scorer,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
output_scores=output_scores,
**model_kwargs,
)
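# Minimal usage sketch: in LiBai, `Generator` is meant to be mixed into a model
# class that provides `forward`, `self.cfg`, and `prepare_inputs_for_generation`,
# after which callers invoke `model.generate(input_ids, max_new_tokens=..., do_sample=...)`.
# The standalone snippet below only exercises a helper that does not need a model.
if __name__ == "__main__":
    gen = Generator()
    criteria = gen._get_stopping_criteria(
        max_length=16, max_time=None, stopping_criteria=StoppingCriteriaList()
    )
    print(criteria.max_length)  # 16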
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import oneflow as flow
from PIL import Image
from libai.config import instantiate
from libai.data.structures import DistTensorData, Instance
from libai.inference.basic import BasePipeline
class ImageClassificationPipeline(BasePipeline):
def __init__(
self,
config_file,
data_parallel=None,
tensor_parallel=None,
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
model_path=None,
mode="libai",
**kwargs,
):
super().__init__(
config_file,
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
model_path,
mode,
**kwargs,
)
if "num_classes" in self.cfg.model:
self.num_classes = self.cfg.model.num_classes
elif "num_classes" in self.cfg.model.cfg:
self.num_classes = self.cfg.model.cfg.num_classes
else:
raise AttributeError("The model's config must contain num_classes")
label2id = self.label2id(self.num_classes)
self.id2label = {ind: label for label, ind in label2id.items()}
self.transform = instantiate(self.cfg.dataloader.test[0].dataset.transform)
def _parse_parameters(self, **pipeline_parameters):
preprocess_params = {}
forward_params = {}
postprocess_params = {**pipeline_parameters}
return preprocess_params, forward_params, postprocess_params
def preprocess(
self,
inputs,
**kwargs,
) -> dict:
assert os.path.exists(inputs), "inputs must be an existing image path!"
with open(inputs, "rb") as f:
img = Image.open(f).convert("RGB")
img = self.transform(img)
img = img.unsqueeze(0)
# to global tensor
model_input = Instance(
images=DistTensorData(img),
)
        model_input_dict = {}
        for key, value in model_input.get_fields().items():
            value.to_global()
            model_input_dict[key] = value.tensor
        return model_input_dict
    def forward(self, model_input_dict) -> dict:
        model_outputs_dict = self.model(**model_input_dict)
        return model_outputs_dict
def postprocess(
self, model_outputs_dict, function_to_apply=None, return_all_scores=False, **kwargs
) -> dict:
# prepare
num_labels = self.num_classes
if function_to_apply is not None:
function_to_apply = function_to_apply.lower()
assert function_to_apply in [
"sigmoid",
"softmax",
"none",
], f"Unrecognized `function_to_apply` argument: {function_to_apply}"
else:
if num_labels == 1:
function_to_apply = "sigmoid"
elif num_labels > 1:
function_to_apply = "softmax"
# process, logits: [num_labels]
logits = model_outputs_dict["prediction_scores"][0]
if function_to_apply == "sigmoid":
scores = flow.sigmoid(logits)
elif function_to_apply == "softmax":
scores = flow.softmax(logits)
else:
scores = logits
scores = scores.detach().numpy()
if return_all_scores:
return [
{"label": self.id2label[i], "score": score.item()} for i, score in enumerate(scores)
]
else:
return {
"label": self.id2label[scores.argmax().item()],
"score": scores.max().item(),
}
def label2id(self, num_classes):
"""
Args:
num_classes (int): the number of total classes
Returns:
labels (list): a dict contains all the labels for inference,
each item should be the form as follows:
{
"tench": 0,
"tiger": 1,
"xxx", n,
}
"""
from libai.inference.utils.imagenet_class import IMAGENET_LABELS as labels
assert num_classes == len(labels), "number of labels must be equal to num_classes"
return {label: i for (i, label) in enumerate(labels)}
if __name__ == "__main__":
pipeline = ImageClassificationPipeline("/home/chengpeng/config.yaml", 1, 1, 1)
print(pipeline("data_test/inference_test_data/ILSVRC2012_val_00000293.JPEG"))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import oneflow as flow
from libai.data.structures import DistTensorData, Instance
from libai.inference.basic import BasePipeline
class TextClassificationPipeline(BasePipeline):
def __init__(
self,
config_file,
data_parallel=None,
tensor_parallel=None,
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
model_path=None,
mode="libai",
**kwargs,
):
super().__init__(
config_file,
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
            pipeline_num_layers,
            model_path,
mode,
**kwargs,
)
def update_cfg(
self,
data_parallel=1,
tensor_parallel=1,
pipeline_parallel=1,
pipeline_stage_id=None,
pipeline_num_layers=None,
):
super().update_cfg(
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
)
self.cfg.model.cfg.hidden_dropout_prob = 0.0
self.cfg.model.cfg.attention_probs_dropout_prob = 0.0
assert "num_labels" in self.cfg.model.cfg, "The model's config must contain num_labels"
if "label2id" not in self.cfg.model.cfg:
label2id = {"Label_" + str(i): i for i in range(self.cfg.model.cfg.num_labels)}
id2label = {ind: label for label, ind in label2id.items()}
self.cfg.model.cfg["label2id"] = label2id
self.cfg.model.cfg["id2label"] = id2label
def _parse_parameters(self, **pipeline_parameters):
preprocess_params = {}
forward_params = {}
postprocess_params = {**pipeline_parameters}
return preprocess_params, forward_params, postprocess_params
def preprocess(
self,
inputs,
pad: bool = False,
**kwargs,
) -> dict:
# tokenizer encoder
input_ids = flow.tensor(np.array(self.tokenizer.encode(inputs)))
padding_mask = flow.tensor(np.ones(input_ids.shape), dtype=flow.bool)
# set batch size = 1
input_ids = input_ids.unsqueeze(0)
padding_mask = padding_mask.unsqueeze(0)
# to global tensor
model_input = Instance(
input_ids=DistTensorData(input_ids),
attention_mask=DistTensorData(padding_mask),
)
        model_input_dict = {}
        for key, value in model_input.get_fields().items():
            value.to_global()
            model_input_dict[key] = value.tensor
        return model_input_dict
    def forward(self, model_input_dict) -> dict:
        model_outputs_dict = self.model(**model_input_dict)
        return model_outputs_dict
def postprocess(
self, model_outputs_dict, function_to_apply=None, return_all_scores=False, **kwargs
) -> dict:
# prepare
num_labels = self.cfg.model.cfg.num_labels
if function_to_apply is not None:
function_to_apply = function_to_apply.lower()
assert function_to_apply in [
"sigmoid",
"softmax",
"none",
], f"Unrecognized `function_to_apply` argument: {function_to_apply}"
else:
if num_labels == 1:
function_to_apply = "sigmoid"
elif num_labels > 1:
function_to_apply = "softmax"
# process, logits: [num_labels]
logits = model_outputs_dict["logits"][0]
if function_to_apply == "sigmoid":
scores = flow.sigmoid(logits)
elif function_to_apply == "softmax":
scores = flow.softmax(logits)
else:
scores = logits
scores = scores.detach().numpy()
if return_all_scores:
return [
{"label": self.cfg.model.cfg.id2label[i], "score": score.item()}
for i, score in enumerate(scores)
]
else:
return {
"label": self.cfg.model.cfg.id2label[scores.argmax().item()],
"score": scores.max().item(),
}
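# Minimal usage sketch, mirroring the `__main__` example in the image
# classification pipeline above; the config path and the input sentence are
# placeholders, not files or data shipped with this repository.
if __name__ == "__main__":
    pipeline = TextClassificationPipeline("path/to/config.yaml", 1, 1, 1)
    print(pipeline("this is a test sentence"))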