Unverified Commit d216e798 authored by Thomas Wolf, committed by GitHub

Merge pull request #777 from huggingface/examples

Working GLUE Example for XLNet (STS-B) 
parents ed6c8d37 6135de2f
......@@ -1620,20 +1620,10 @@ and unpack it to some directory `$GLUE_DIR`.
```shell
export GLUE_DIR=/path/to/glue
python run_xlnet_classifier.py \
--task_name STS-B \
--do_train \
--do_eval \
--data_dir $GLUE_DIR/STS-B/ \
--max_seq_length 128 \
--train_batch_size 8 \
--gradient_accumulation_steps 1 \
--learning_rate 5e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/mrpc_output/
CUDA_VISIBLE_DEVICES=0,1,2,3 python ./examples/run_glue.py \
    --do_train \
    --task_name=sts-b \
    --data_dir=${GLUE_DIR}/STS-B \
    --output_dir=./proc_data/sts-b-110 \
    --max_seq_length=128 \
    --per_gpu_eval_batch_size=8 \
    --per_gpu_train_batch_size=8 \
    --max_steps=1200 \
    --model_name=xlnet-large-cased \
    --overwrite_output_dir \
    --overwrite_cache \
    --warmup_steps=120
```
Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus) and gave evaluation results between 84% and 88%.
These hyper-parameters give an evaluation Pearson correlation coefficient (pearsonr) above 0.918.
### Distributed training
......
......@@ -21,6 +21,7 @@ import csv
import logging
import os
import sys
from io import open
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
......
......@@ -36,7 +36,7 @@ from .modeling_xlm import (XLMConfig, XLMModel,
from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
from .optimization import BertAdam
from .optimization_openai import OpenAIAdam
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
......@@ -73,17 +73,17 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path))
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
......@@ -93,7 +93,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
# which are not required when using the pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
print("Skipping {}".format("/".join(name)))
logger.info("Skipping {}".format("/".join(name)))
continue
pointer = model
for m_name in name:
......@@ -113,7 +113,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
try:
pointer = getattr(pointer, l[0])
except AttributeError:
print("Skipping {}".format("/".join(name)))
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
......@@ -127,7 +127,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
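Because these conversion utilities now report through the module-level logger rather than `print`, their progress messages are silent until logging is configured. A minimal sketch using only the standard library, assuming the package's loggers follow the `logging.getLogger(__name__)` convention shown in this file:
```python
import logging

# Send log records to the console; keep third-party packages at WARNING.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.WARNING,
)

# Raise only this package's loggers to INFO so the conversion messages
# ("Converting TensorFlow checkpoint from ...", "Loading TF weight ...") show up.
logging.getLogger("pytorch_transformers").setLevel(logging.INFO)
```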
......
......@@ -49,17 +49,17 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(gpt2_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path))
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array.squeeze())
......@@ -90,7 +90,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
......
......@@ -110,7 +110,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
......
......@@ -126,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
# Build TF to PyTorch weights loading map
......@@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
init_vars = tf.train.list_variables(tf_path)
tf_weights = {}
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array
......@@ -157,7 +157,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
print("Initialize PyTorch weight {} for layer {}".format(name, i))
logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
p_i.data = torch.from_numpy(arr_i)
else:
try:
......@@ -165,13 +165,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + '/Adam', None)
tf_weights.pop(name + '/Adam_1', None)
print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model
......
......@@ -272,7 +272,6 @@ class LogUniformSampler(object):
self.range_max = range_max
log_indices = torch.arange(1., range_max+2., 1.).log_()
self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# print('P', self.dist.numpy().tolist()[-30:])
self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
......@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
return logits
# class LogUniformSampler(object):
# def __init__(self, range_max, unique=False):
# """
# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
# """
# self.range_max = range_max
# log_indices = torch.arange(1., range_max+2., 1.).log_()
# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# self.unique = unique
# if self.unique:
# self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
# def sample(self, n_sample, labels):
# pos_sample, new_labels = labels.unique(return_inverse=True)
# n_pos_sample = pos_sample.size(0)
# n_neg_sample = n_sample - n_pos_sample
# if self.unique:
# self.exclude_mask.index_fill_(0, pos_sample, 1)
# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
# self.exclude_mask.index_fill_(0, pos_sample, 0)
# else:
# sample_dist = self.dist
# neg_sample = torch.multinomial(sample_dist, n_neg_sample)
# sample = torch.cat([pos_sample, neg_sample])
# sample_prob = self.dist[sample]
# return new_labels, sample, sample_prob
if __name__ == '__main__':
S, B = 3, 4
n_vocab = 10000
n_sample = 5
H = 32
labels = torch.LongTensor(S, B).random_(0, n_vocab)
# sampler = LogUniformSampler(n_vocab, unique=False)
# new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
# true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
# print('true_probs', true_probs.numpy().tolist())
# print('samp_probs', samp_probs.numpy().tolist())
# print('neg_samples', neg_samples.numpy().tolist())
# print('sum', torch.sum(sampler.dist).item())
# assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
embedding = nn.Embedding(n_vocab, H)
bias = torch.zeros(n_vocab)
inputs = torch.Tensor(S, B, H).normal_()
logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)
print('logits', logits.detach().numpy().tolist())
print('logits shape', logits.size())
print('out_labels', out_labels.detach().numpy().tolist())
print('out_labels shape', out_labels.size())
......@@ -57,16 +57,18 @@ class PretrainedConfig(object):
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `xlnet-large-cased`
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
- a path or url to a directory containing a configuration file `config.json` for the model,
- a path or url to a configuration file for the model.
cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
"""
cache_dir = kwargs.pop('cache_dir', None)
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
else:
elif os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
else:
config_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
......@@ -102,7 +104,7 @@ class PretrainedConfig(object):
for key in to_remove:
kwargs.pop(key, None)
logger.info("Model config {}".format(config))
logger.info("Model config %s", config)
return config
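With the directory branch added above, a configuration can be resolved from a shortcut name, from a directory containing `config.json`, or from a direct path or URL to the file. A hedged sketch of the three call forms, assuming `XLNetConfig` is exported by the package; the local paths are hypothetical:
```python
from pytorch_transformers import XLNetConfig

# 1) Shortcut name, resolved through pretrained_config_archive_map and cached.
config = XLNetConfig.from_pretrained("xlnet-large-cased")

# 2) Directory that contains a config.json (hypothetical path).
config = XLNetConfig.from_pretrained("./proc_data/sts-b-110")

# 3) Direct path (or URL) to a configuration file (hypothetical path).
config = XLNetConfig.from_pretrained("./proc_data/sts-b-110/config.json")
```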
@classmethod
......@@ -200,6 +202,7 @@ class PreTrainedModel(nn.Module):
- a path or url to a tensorflow pretrained model checkpoint containing:
. `config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
config: an optional configuration for the model
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use
......@@ -207,23 +210,31 @@ class PreTrainedModel(nn.Module):
*inputs, **kwargs: additional input for the specific XLNet class
(ex: num_labels for XLNetForSequenceClassification)
"""
config = kwargs.pop('config', None)
state_dict = kwargs.pop('state_dict', None)
cache_dir = kwargs.pop('cache_dir', None)
from_tf = kwargs.pop('from_tf', False)
output_loading_info = kwargs.pop('output_loading_info', False)
# Load config
config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
if config is None:
config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
# Load model
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
else:
elif os.path.isdir(pretrained_model_name_or_path):
if from_tf:
# Directly load from a TensorFlow checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
else:
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
if from_tf:
# Directly load from a TensorFlow checkpoint
archive_file = pretrained_model_name_or_path + ".index"
else:
archive_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
......
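The model-side resolution mirrors the config logic: a shortcut name, a directory (holding `pytorch_model.bin`, or a TF checkpoint when `from_tf=True`), or a direct path, and an already-built configuration can now be passed in through the new `config` keyword instead of being re-resolved. A hedged sketch, assuming `XLNetModel` and `XLNetConfig` are exported; paths are hypothetical:
```python
from pytorch_transformers import XLNetConfig, XLNetModel

# Shortcut name: weights come from pretrained_model_archive_map and are cached.
model = XLNetModel.from_pretrained("xlnet-large-cased")

# Local directory with pytorch_model.bin (hypothetical path);
# add from_tf=True to read a TF checkpoint named by TF_WEIGHTS_NAME instead.
model = XLNetModel.from_pretrained("./proc_data/sts-b-110")

# Reuse a configuration built elsewhere via the new `config` keyword.
config = XLNetConfig.from_pretrained("xlnet-large-cased")
model = XLNetModel.from_pretrained("xlnet-large-cased", config=config)
```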
......@@ -122,14 +122,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
tf_weights = {}
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array
......@@ -137,15 +137,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
for name, pointer in tf_to_pt_map.items():
print("Importing {}".format(name))
logger.info("Importing {}".format(name))
if name not in tf_weights:
print("{} not in tf pre-trained weights, skipping".format(name))
logger.info("{} not in tf pre-trained weights, skipping".format(name))
continue
array = tf_weights[name]
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
# which are not required when using the pretrained model
if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
print("Transposing")
logger.info("Transposing")
array = np.transpose(array)
if isinstance(pointer, list):
# Here we will split the TF weights
......@@ -157,7 +157,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
print("Initialize PyTorch weight {} for layer {}".format(name, i))
logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
p_i.data = torch.from_numpy(arr_i)
else:
try:
......@@ -165,13 +165,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + '/Adam', None)
tf_weights.pop(name + '/Adam_1', None)
print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model
......@@ -211,10 +211,6 @@ class XLNetConfig(PretrainedConfig):
layer_norm_eps=1e-12,
dropout=0.1,
dropatt=0.1,
init="normal",
init_range=0.1,
init_std=0.02,
mem_len=None,
reuse_len=None,
bi_data=False,
......@@ -258,11 +254,6 @@ class XLNetConfig(PretrainedConfig):
dropout: float, dropout rate.
dropatt: float, dropout rate on attention probabilities.
init: str, the initialization scheme, either "normal" or "uniform".
init_range: float, initialize the parameters with a uniform distribution
in [-init_range, init_range]. Only effective when init="uniform".
init_std: float, initialize the parameters with a normal distribution
with mean 0 and stddev init_std. Only effective when init="normal".
mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the current batch to be cached
and reused in the future.
......@@ -297,11 +288,7 @@ class XLNetConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.init = init
self.init_range = init_range
self.init_std = init_std
self.dropout = dropout
self.dropatt = dropatt
self.mem_len = mem_len
self.reuse_len = reuse_len
self.bi_data = bi_data
......@@ -393,7 +380,7 @@ class XLNetRelativeAttention(nn.Module):
x = x[1:, ...]
x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
# x = x[:, 0:klen, :, :]
x = torch.index_select(x, 1, torch.arange(klen))
x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
return x
......
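The only functional change in this hunk is that the index tensor for `torch.index_select` is created on `x`'s device (and as `torch.long`), so the relative-position shift keeps working when the model runs on GPU. A standalone sketch of the same shift pattern; the shapes are illustrative, not XLNet's real ones:
```python
import torch

def rel_shift(x, klen):
    # x: (qlen, rlen, bsz, n_head) relative-position attention scores.
    x_size = x.shape
    x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3])
    x = x[1:, ...]
    x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
    # Building the index on x.device keeps this valid under CUDA.
    return torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))

scores = torch.randn(4, 8, 2, 3)
print(rel_shift(scores, klen=4).shape)  # torch.Size([4, 4, 2, 3])
```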
......@@ -14,174 +14,92 @@
# limitations under the License.
"""PyTorch optimization for BERT model."""
import logging
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
import logging
import abc
import sys
from torch.optim.lr_scheduler import LambdaLR
logger = logging.getLogger(__name__)
class ConstantLRSchedule(LambdaLR):
def __init__(self, optimizer, last_epoch=-1):
super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
if sys.version_info >= (3, 4):
ABC = abc.ABC
else:
ABC = abc.ABCMeta('ABC', (), {})
class _LRSchedule(ABC):
""" Parent of all LRSchedules here. """
warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense
def __init__(self, warmup=0.002, t_total=-1, **kw):
"""
:param warmup: what fraction of t_total steps will be used for linear warmup
:param t_total: how many training steps (updates) are planned
:param kw:
"""
super(_LRSchedule, self).__init__(**kw)
if t_total < 0:
logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
warmup = max(warmup, 0.)
self.warmup, self.t_total = float(warmup), float(t_total)
self.warned_for_t_total_at_progress = -1
def get_lr(self, step, nowarn=False):
"""
:param step: which of t_total steps we're on
:param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
:return: learning rate multiplier for current update
"""
if self.t_total < 0:
return 1.
progress = float(step) / self.t_total
ret = self.get_lr_(progress)
# warning for exceeding t_total (only active with warmup_linear
if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
logger.warning(
"Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
.format(ret, self.__class__.__name__))
self.warned_for_t_total_at_progress = progress
# end warning
return ret
@abc.abstractmethod
def get_lr_(self, progress):
"""
:param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
:return: learning rate multiplier for current update
"""
return 1.
class ConstantLR(_LRSchedule):
def get_lr_(self, progress):
return 1.
class WarmupCosineSchedule(_LRSchedule):
class WarmupCosineSchedule(LambdaLR):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
:param warmup: see LRSchedule
:param t_total: see LRSchedule
:param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
:param kw:
"""
warn_t_total = True
def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
"""
:param warmup: see LRSchedule
:param t_total: see LRSchedule
:param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
:param kw:
"""
super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
self.cycles = cycles
def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
else:
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
def lr_lambda(step):
if step < warmup_steps:
return step / max(1, warmup_steps)
else:
progress = (step - warmup_steps) / max(1, t_total - warmup_steps) # progress after warmup
return 0.5 * (1. + math.cos(math.pi * cycles * 2 * progress))
super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
class WarmupCosineWithHardRestartsSchedule(LambdaLR):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
learning rate (with hard restarts).
"""
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
assert(cycles >= 1.)
def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
else:
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
return ret
def lr_lambda(step):
if step < warmup_steps:
return step / max(1, warmup_steps)
else:
progress = (step - warmup_steps) / max(1, t_total - warmup_steps) # progress after warmup
ret = 0.5 * (1. + math.cos(math.pi * ((cycles * progress) % 1)))
return ret
super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
"""
All training progress is divided in `cycles` (default=1.) parts of equal length.
Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
followed by a learning rate decreasing from 1. to 0. following a cosine curve.
"""
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
assert(warmup * cycles < 1.)
warmup = warmup * cycles if warmup >= 0 else warmup
super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
def get_lr_(self, progress):
progress = progress * self.cycles % 1.
if progress < self.warmup:
return progress / self.warmup
else:
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
ret = 0.5 * (1. + math.cos(math.pi * progress))
return ret
class WarmupConstantSchedule(_LRSchedule):
class WarmupConstantSchedule(LambdaLR):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
Keeps learning rate equal to 1. after warmup.
"""
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
return 1.
def __init__(self, optimizer, warmup_steps, last_epoch=-1):
def lr_lambda(step):
if step < warmup_steps:
return step / warmup_steps
return 1.
super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
class WarmupLinearSchedule(_LRSchedule):
class WarmupLinearSchedule(LambdaLR):
"""
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
"""
warn_t_total = True
def get_lr_(self, progress):
if progress < self.warmup:
return progress / self.warmup
return max((progress - 1.) / (self.warmup - 1.), 0.)
def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
def lr_lambda(step):
if step < warmup_steps:
return step / max(1, warmup_steps)
return (t_total - step) / max(1, t_total - warmup_steps)
SCHEDULES = {
None: ConstantLR,
"none": ConstantLR,
"warmup_cosine": WarmupCosineSchedule,
"warmup_constant": WarmupConstantSchedule,
"warmup_linear": WarmupLinearSchedule
}
super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
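The rewritten schedules are thin `LambdaLR` wrappers keyed on absolute step counts (`warmup_steps`, `t_total`) rather than on progress fractions, so their behaviour is easy to inspect directly. A throwaway sketch (toy optimizer, hypothetical step counts) that prints the multiplier each schedule applies to the base learning rate:
```python
import torch
from pytorch_transformers import (WarmupConstantSchedule, WarmupCosineSchedule,
                                  WarmupLinearSchedule)

def lr_trace(schedule_cls, total_steps=10, **kwargs):
    # Base lr of 1.0 makes the observed lr equal to the schedule's multiplier.
    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=1.0)
    scheduler = schedule_cls(optimizer, **kwargs)
    trace = []
    for _ in range(total_steps):
        trace.append(round(optimizer.param_groups[0]["lr"], 3))
        scheduler.step()
    return trace

print(lr_trace(WarmupConstantSchedule, warmup_steps=4))            # ramp, then flat at 1.0
print(lr_trace(WarmupLinearSchedule, warmup_steps=4, t_total=10))  # ramp, then linear decay
print(lr_trace(WarmupCosineSchedule, warmup_steps=4, t_total=10))  # ramp, then cosine decay
```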
class BertAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix.
class AdamW(Optimizer):
""" Implements Adam algorithm with weight decay fix.
Parameters:
lr: learning rate
......@@ -197,43 +115,20 @@ class BertAdam(Optimizer):
e: Adams epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
correct_bias: can be set to False to avoid correcting bias in Adam (e.g. like in Bert repository)
"""
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
if lr is not required and lr < 0.0:
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01, correct_bias=True):
if lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1] ))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# initialize schedule object
if not isinstance(schedule, _LRSchedule):
schedule_type = SCHEDULES[schedule]
schedule = schedule_type(warmup=warmup, t_total=t_total)
else:
if warmup != -1 or t_total != -1:
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
"Please specify custom warmup and t_total in _LRSchedule object.")
defaults = dict(lr=lr, schedule=schedule,
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(BertAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
lr.append(lr_scheduled)
return lr
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
correct_bias=correct_bias)
super(AdamW, self).__init__(params, defaults)
def step(self, closure=None):
"""Performs a single optimization step.
......@@ -260,22 +155,28 @@ class BertAdam(Optimizer):
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['next_m'] = torch.zeros_like(p.data)
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['next_v'] = torch.zeros_like(p.data)
state['exp_avg_sq'] = torch.zeros_like(p.data)
next_m, next_v = state['next_m'], state['next_v']
beta1, beta2 = group['b1'], group['b2']
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
state['step'] += 1
# Decay the first and second moment running average coefficient
# In-place operations to update the averages at the same time
next_m.mul_(beta1).add_(1 - beta1, grad)
next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
update = next_m / (next_v.sqrt() + group['e'])
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['eps'])
step_size = group['lr']
if group['correct_bias']: # No bias correction for Bert
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
......@@ -284,20 +185,8 @@ class BertAdam(Optimizer):
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
update_with_lr = lr_scheduled * update
p.data.add_(-update_with_lr)
state['step'] += 1
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# No bias correction
# bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step']
# Add weight decay at the end (fixed version)
if group['weight_decay'] > 0:
p.data.add_(-group['lr'] * group['weight_decay'], p.data)
return loss
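`AdamW` keeps only the decoupled weight-decay Adam update; the learning-rate schedule and gradient clipping that `BertAdam`/`OpenAIAdam` handled internally now live in the training loop next to a scheduler. A hedged sketch of that pairing with placeholder data, not the repository's actual `run_glue.py` loop:
```python
import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(10, 1)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=120, t_total=1200)

for step in range(1200):
    loss = model(torch.randn(8, 10)).pow(2).mean()  # placeholder batch and loss
    loss.backward()
    # Gradient clipping is no longer done by the optimizer; apply it here if needed.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()  # advance the warmup/decay multiplier once per update
    optimizer.zero_grad()
```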
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for OpenAI GPT model."""
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
import logging
from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \
WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule
logger = logging.getLogger(__name__)
class OpenAIAdam(Optimizer):
"""Implements Open AI version of Adam algorithm with weight decay fix.
"""
def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
vector_l2=False, max_grad_norm=-1, **kwargs):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# initialize schedule object
if not isinstance(schedule, _LRSchedule):
schedule_type = SCHEDULES[schedule]
schedule = schedule_type(warmup=warmup, t_total=t_total)
else:
if warmup != -1 or t_total != -1:
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
"Please specify custom warmup and t_total in _LRSchedule object.")
defaults = dict(lr=lr, schedule=schedule,
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
max_grad_norm=max_grad_norm)
super(OpenAIAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
lr.append(lr_scheduled)
return lr
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['b1'], group['b2']
state['step'] += 1
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['e'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
# Add weight decay at the end (fixed version)
if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
return loss
......@@ -20,10 +20,9 @@ import unittest
import torch
from pytorch_transformers import BertAdam
from pytorch_transformers import OpenAIAdam
from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
import numpy as np
......@@ -34,12 +33,12 @@ class OptimizationTest(unittest.TestCase):
for a, b in zip(list1, list2):
self.assertAlmostEqual(a, b, delta=tol)
def test_adam(self):
def test_adam_w(self):
w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
target = torch.tensor([0.4, 0.2, -0.5])
criterion = torch.nn.MSELoss()
# No warmup, constant schedule, no gradient clipping
optimizer = BertAdam(params=[w], lr=2e-1,
optimizer = AdamW(params=[w], lr=2e-1,
weight_decay=0.0)
for _ in range(100):
......@@ -52,23 +51,13 @@ class OptimizationTest(unittest.TestCase):
class ScheduleInitTest(unittest.TestCase):
def test_bert_sched_init(self):
m = torch.nn.Linear(50, 50)
optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
# shouldn't fail
def test_openai_sched_init(self):
def test_sched_init(self):
m = torch.nn.Linear(50, 50)
optim = AdamW(m.parameters(), lr=0.001)
ConstantLRSchedule(optim)
WarmupConstantSchedule(optim, warmup_steps=100)
WarmupLinearSchedule(optim, warmup_steps=100, t_total=1000)
# shouldn't fail
......
......@@ -98,14 +98,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.build_vocab()
def count_file(self, path, verbose=False, add_eos=False):
if verbose: print('counting file {} ...'.format(path))
if verbose: logger.info('counting file {} ...'.format(path))
assert os.path.exists(path)
sents = []
with open(path, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
logger.info(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols)
sents.append(symbols)
......@@ -116,10 +116,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
"""
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose: print('counting {} sents ...'.format(len(sents)))
if verbose: logger.info('counting {} sents ...'.format(len(sents)))
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
logger.info(' line {}'.format(idx))
self.counter.update(symbols)
def _build_from_file(self, vocab_file):
......@@ -147,11 +147,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
def build_vocab(self):
if self.vocab_file:
print('building vocab from {}'.format(self.vocab_file))
logger.info('building vocab from {}'.format(self.vocab_file))
self._build_from_file(self.vocab_file)
print('final vocab size {}'.format(len(self)))
logger.info('final vocab size {}'.format(len(self)))
else:
print('building vocab with min_freq={}, max_size={}'.format(
logger.info('building vocab with min_freq={}, max_size={}'.format(
self.min_freq, self.max_size))
self.idx2sym = []
self.sym2idx = OrderedDict()
......@@ -163,18 +163,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if cnt < self.min_freq: break
self.add_symbol(sym)
print('final vocab size {} from {} unique tokens'.format(
logger.info('final vocab size {} from {} unique tokens'.format(
len(self), len(self.counter)))
def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
add_double_eos=False):
if verbose: print('encoding file {} ...'.format(path))
if verbose: logger.info('encoding file {} ...'.format(path))
assert os.path.exists(path)
encoded = []
with open(path, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
logger.info(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos,
add_double_eos=add_double_eos)
encoded.append(self.convert_to_tensor(symbols))
......@@ -185,11 +185,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return encoded
def encode_sents(self, sents, ordered=False, verbose=False):
if verbose: print('encoding {} sents ...'.format(len(sents)))
if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
encoded = []
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
logger.info(' line {}'.format(idx))
encoded.append(self.convert_to_tensor(symbols))
if ordered:
......@@ -218,7 +218,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if sym in self.sym2idx:
return self.sym2idx[sym]
else:
# print('encounter unk {}'.format(sym))
# logger.info('encounter unk {}'.format(sym))
# assert '<eos>' not in sym
if hasattr(self, 'unk_idx'):
return self.sym2idx.get(sym, self.unk_idx)
......@@ -544,14 +544,14 @@ def get_lm_corpus(datadir, dataset):
fn = os.path.join(datadir, 'cache.pt')
fn_pickle = os.path.join(datadir, 'cache.pkl')
if os.path.exists(fn):
print('Loading cached dataset...')
logger.info('Loading cached dataset...')
corpus = torch.load(fn)
elif os.path.exists(fn_pickle):
print('Loading cached dataset from pickle...')
logger.info('Loading cached dataset from pickle...')
with open(fn_pickle, "rb") as fp:
corpus = pickle.load(fp)
else:
print('Producing dataset {}...'.format(dataset))
logger.info('Producing dataset {}...'.format(dataset))
kwargs = {}
if dataset in ['wt103', 'wt2']:
kwargs['special'] = ['<eos>']
......