Commit b21d84b0 authored by thomwolf

update examples

parent ec07cf5a
@@ -74,8 +74,8 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-    schedule = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -300,6 +300,8 @@ def main():
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
                         help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,
                         help="Max gradient norm.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
@@ -358,7 +360,9 @@ def main():
     args.device = device

     # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
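For reference, a minimal, self-contained sketch of how the updated optimizer setup is intended to be used, assuming pytorch_transformers exposes AdamW and WarmupLinearSchedule at the top level; the toy model, hyper-parameter values and step count below are illustrative only, not taken from this commit:

import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule  # assumed top-level import

model = torch.nn.Linear(10, 2)                   # toy stand-in for a transformer model
no_decay = ['bias', 'LayerNorm.weight']          # parameter names excluded from weight decay
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
t_total = 1000                                   # hypothetical total number of optimization steps
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)   # eps now exposed as --adam_epsilon
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=t_total)

for step in range(t_total):
    loss = model(torch.randn(4, 10)).sum()       # dummy forward pass standing in for the real loss
    loss.backward()
    optimizer.step()
    scheduler.step()                             # advance the linear warmup / decay schedule
    optimizer.zero_grad()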
# Copyright (c) 2019-present, the HuggingFace Inc. authors.
# All rights reserved. This source code is licensed under the BSD-style
# license found in the LICENSE file in the root directory of this source tree.
import logging
import os
from tqdm import tqdm
from pprint import pformat
import torch
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import RunningAverage
from ignite.contrib.handlers import ProgressBar
from ignite.contrib.handlers.tensorboard_logger import OptimizerParamsHandler, OutputHandler, TensorboardLogger

# Filename under which the training configuration is saved; the value is assumed here,
# since the original import/definition of CONFIG_NAME is not visible in this view.
CONFIG_NAME = "config.json"

def average_distributed_scalar(scalar, args):
    """ Average a scalar over the nodes if we are in distributed training.
        We use this for distributed evaluation.
        Beware: such averaging only works for metrics that are additive with regard
        to the evaluation dataset, e.g. accuracy or log probabilities.
        It doesn't work for ratio metrics like F1.
    """
    if args.local_rank == -1:
        # Not in distributed mode: return the scalar unchanged
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()
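A hypothetical way to plug this helper into an Ignite evaluator through MetricsLambda, so the metric value is averaged across workers at compute time; the evaluation step, metric choice and Namespace fields below are illustrative only:

from argparse import Namespace
import torch
from ignite.engine import Engine
from ignite.metrics import Accuracy, MetricsLambda

args = Namespace(local_rank=-1, device="cpu")      # single-process setup; no averaging actually happens
model = torch.nn.Linear(10, 2)                     # toy classifier

def eval_step(engine, batch):
    x, y = batch
    with torch.no_grad():
        return model(x), y                         # (logits, labels) for the Accuracy metric

evaluator = Engine(eval_step)
accuracy = Accuracy()
# Wrap the per-process accuracy so it would be averaged over all workers in distributed evaluation
MetricsLambda(average_distributed_scalar, accuracy, args).attach(evaluator, "avg_accuracy")
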
def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""):
    """ Attach TensorBoard logging, a progress bar with the running average loss
        and per-epoch checkpoint saving to a PyTorch Ignite training engine,
        and save the training configuration.
    """
    # Add progress bar with average loss
    RunningAverage(output_transform=lambda x: x).attach(trainer, prefix + "loss")
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=[prefix + "loss"])
    evaluator.add_event_handler(Events.COMPLETED, lambda _:
                                pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    # Add tensorboard logging with training and evaluation metrics
    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=[prefix + "loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)

    @evaluator.on(Events.COMPLETED)
    def tb_log_metrics(engine):
        for name in metrics.keys():
            tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration)

    # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()')
    checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})

    # Save training configuration
    torch.save(args, os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))

    return checkpoint_handler, tb_logger
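And a hypothetical end-to-end wiring of add_logging_and_checkpoint_saving with a toy training/evaluation pair; the model, the update and evaluation steps and the metrics dict are illustrative stand-ins, not code from this commit:

from argparse import Namespace
import torch
import torch.nn.functional as F
from ignite.engine import Engine
from ignite.metrics import Accuracy

args = Namespace(local_rank=-1, device="cpu")              # hypothetical configuration namespace
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

def update(engine, batch):
    x, y = batch
    loss = F.cross_entropy(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()                                      # scalar picked up by the RunningAverage above

def evaluate(engine, batch):
    x, y = batch
    with torch.no_grad():
        return model(x), y

trainer, evaluator = Engine(update), Engine(evaluate)
metrics = {"accuracy": Accuracy()}
for name, metric in metrics.items():
    metric.attach(evaluator, name)

checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving(
    trainer, evaluator, metrics, model, optimizer, args)
# trainer.run(train_loader, max_epochs=3) would then log to TensorBoard,
# print validation metrics after each evaluator run and save a checkpoint per epoch.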
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 class ConstantLRSchedule(LambdaLR):
     def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLR, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
+        super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)


 class WarmupCosineSchedule(LambdaLR):
     """
@@ -128,7 +128,7 @@ class AdamW(Optimizer):
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                         correct_bias=correct_bias)
-        super(BertAdam, self).__init__(params, defaults)
+        super(AdamW, self).__init__(params, defaults)

     def step(self, closure=None):
         """Performs a single optimization step.
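The two hunks above are name fixes: the classes were renamed (ConstantLR to ConstantLRSchedule, BertAdam to AdamW) but the super() calls still referenced the old names, which would raise a NameError the first time the class is instantiated, assuming the old names are no longer defined in the module. A minimal illustration with stand-in names (BrokenSchedule, FixedSchedule and OldName are not library code):

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

class BrokenSchedule(LambdaLR):
    def __init__(self, optimizer, last_epoch=-1):
        # Stale class name in super(), as in the pre-fix code: fails at instantiation time
        super(OldName, self).__init__(optimizer, lambda step: 1.0, last_epoch=last_epoch)

class FixedSchedule(LambdaLR):
    def __init__(self, optimizer, last_epoch=-1):
        super(FixedSchedule, self).__init__(optimizer, lambda step: 1.0, last_epoch=last_epoch)

optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
FixedSchedule(optimizer)       # works
# BrokenSchedule(optimizer)    # NameError: name 'OldName' is not defined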