"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "6f877d9daf36788bad4fd228930939fed6ab12bd"
Unverified commit d216e798, authored by Thomas Wolf, committed by GitHub

Merge pull request #777 from huggingface/examples

Working GLUE Example for XLNet (STS-B) 
parents ed6c8d37 6135de2f
@@ -1620,20 +1620,10 @@ and unpack it to some directory `$GLUE_DIR`.
 ```shell
 export GLUE_DIR=/path/to/glue
-python run_xlnet_classifier.py \
-  --task_name STS-B \
-  --do_train \
-  --do_eval \
-  --data_dir $GLUE_DIR/STS-B/ \
-  --max_seq_length 128 \
-  --train_batch_size 8 \
-  --gradient_accumulation_steps 1 \
-  --learning_rate 5e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
+CUDA_VISIBLE_DEVICES=0,1,2,3 python ./examples/run_glue.py \
+  --do_train \
+  --task_name=sts-b \
+  --data_dir=${GLUE_DIR}/STS-B \
+  --output_dir=./proc_data/sts-b-110 \
+  --max_seq_length=128 \
+  --per_gpu_eval_batch_size=8 \
+  --per_gpu_train_batch_size=8 \
+  --max_steps=1200 \
+  --model_name=xlnet-large-cased \
+  --overwrite_output_dir \
+  --overwrite_cache \
+  --warmup_steps=120
 ```
-Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus) gave evaluation results between 84% and 88%.
+These hyper-parameters give evaluation results with a Pearson correlation (pearsonr) > 0.918.

 ### Distributed training
......
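Once fine-tuning finishes, the checkpoint written to `--output_dir` can be reloaded through the `from_pretrained()` machinery touched later in this diff. A minimal sketch, assuming the standard `pytorch_transformers` classes (the tokenizer class name is an assumption, it is not shown in this diff):

```python
from pytorch_transformers import XLNetForSequenceClassification, XLNetTokenizer  # XLNetTokenizer assumed

output_dir = "./proc_data/sts-b-110"   # the --output_dir used in the command above
model = XLNetForSequenceClassification.from_pretrained(output_dir)
tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
model.eval()  # ready for evaluation on the STS-B dev set
```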
(Four additional file diffs in this merge are collapsed and not shown here.)
@@ -21,6 +21,7 @@ import csv
 import logging
 import os
 import sys
+from io import open
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
......
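For reference, STS-B is scored by the correlation between predicted and gold similarity scores, which is why `pearsonr`/`spearmanr` are imported above. A rough sketch of such a metric helper (the exact function name and return keys in `utils_glue.py` may differ; this is illustrative):

```python
from scipy.stats import pearsonr, spearmanr

def pearson_and_spearman(preds, labels):
    # Both inputs are 1-D sequences of similarity scores (STS-B uses a 0-5 scale).
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {"pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2}
```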
@@ -36,7 +36,7 @@ from .modeling_xlm import (XLMConfig, XLMModel,
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                              PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
-from .optimization import BertAdam
-from .optimization_openai import OpenAIAdam
+from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
+                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
...@@ -73,17 +73,17 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): ...@@ -73,17 +73,17 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
except ImportError: except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.") "https://www.tensorflow.org/install/ for installation instructions.")
raise raise
tf_path = os.path.abspath(tf_checkpoint_path) tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path)) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model # Load weights from TF model
init_vars = tf.train.list_variables(tf_path) init_vars = tf.train.list_variables(tf_path)
names = [] names = []
arrays = [] arrays = []
for name, shape in init_vars: for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape)) logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name) array = tf.train.load_variable(tf_path, name)
names.append(name) names.append(name)
arrays.append(array) arrays.append(array)
...@@ -93,7 +93,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): ...@@ -93,7 +93,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model # which are not required for using pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name): if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
print("Skipping {}".format("/".join(name))) logger.info("Skipping {}".format("/".join(name)))
continue continue
pointer = model pointer = model
for m_name in name: for m_name in name:
...@@ -113,7 +113,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): ...@@ -113,7 +113,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
try: try:
pointer = getattr(pointer, l[0]) pointer = getattr(pointer, l[0])
except AttributeError: except AttributeError:
print("Skipping {}".format("/".join(name))) logger.info("Skipping {}".format("/".join(name)))
continue continue
if len(l) >= 2: if len(l) >= 2:
num = int(l[1]) num = int(l[1])
...@@ -127,7 +127,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): ...@@ -127,7 +127,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
except AssertionError as e: except AssertionError as e:
e.args += (pointer.shape, array.shape) e.args += (pointer.shape, array.shape)
raise raise
print("Initialize PyTorch weight {}".format(name)) logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array) pointer.data = torch.from_numpy(array)
return model return model
......
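The conversion messages in this file (and in the similar loaders below) now go through the module-level `logger` instead of `print`, so they stay silent unless the calling script configures logging. A minimal sketch to surface them:

```python
import logging

# Show INFO-level messages such as "Loading TF weight ..." and "Initialize PyTorch weight ..."
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    level=logging.INFO)
```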
...@@ -49,17 +49,17 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): ...@@ -49,17 +49,17 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
except ImportError: except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.") "https://www.tensorflow.org/install/ for installation instructions.")
raise raise
tf_path = os.path.abspath(gpt2_checkpoint_path) tf_path = os.path.abspath(gpt2_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path)) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model # Load weights from TF model
init_vars = tf.train.list_variables(tf_path) init_vars = tf.train.list_variables(tf_path)
names = [] names = []
arrays = [] arrays = []
for name, shape in init_vars: for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape)) logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name) array = tf.train.load_variable(tf_path, name)
names.append(name) names.append(name)
arrays.append(array.squeeze()) arrays.append(array.squeeze())
...@@ -90,7 +90,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): ...@@ -90,7 +90,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
except AssertionError as e: except AssertionError as e:
e.args += (pointer.shape, array.shape) e.args += (pointer.shape, array.shape)
raise raise
print("Initialize PyTorch weight {}".format(name)) logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array) pointer.data = torch.from_numpy(array)
return model return model
......
...@@ -110,7 +110,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): ...@@ -110,7 +110,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
except AssertionError as e: except AssertionError as e:
e.args += (pointer.shape, array.shape) e.args += (pointer.shape, array.shape)
raise raise
print("Initialize PyTorch weight {}".format(name)) logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array) pointer.data = torch.from_numpy(array)
return model return model
......
...@@ -126,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): ...@@ -126,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
except ImportError: except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.") "https://www.tensorflow.org/install/ for installation instructions.")
raise raise
# Build TF to PyTorch weights loading map # Build TF to PyTorch weights loading map
...@@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): ...@@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
init_vars = tf.train.list_variables(tf_path) init_vars = tf.train.list_variables(tf_path)
tf_weights = {} tf_weights = {}
for name, shape in init_vars: for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape)) logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name) array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array tf_weights[name] = array
...@@ -157,7 +157,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): ...@@ -157,7 +157,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
except AssertionError as e: except AssertionError as e:
e.args += (p_i.shape, arr_i.shape) e.args += (p_i.shape, arr_i.shape)
raise raise
print("Initialize PyTorch weight {} for layer {}".format(name, i)) logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
p_i.data = torch.from_numpy(arr_i) p_i.data = torch.from_numpy(arr_i)
else: else:
try: try:
...@@ -165,13 +165,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): ...@@ -165,13 +165,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
except AssertionError as e: except AssertionError as e:
e.args += (pointer.shape, array.shape) e.args += (pointer.shape, array.shape)
raise raise
print("Initialize PyTorch weight {}".format(name)) logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array) pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None) tf_weights.pop(name, None)
tf_weights.pop(name + '/Adam', None) tf_weights.pop(name + '/Adam', None)
tf_weights.pop(name + '/Adam_1', None) tf_weights.pop(name + '/Adam_1', None)
print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model return model
......
...@@ -272,7 +272,6 @@ class LogUniformSampler(object): ...@@ -272,7 +272,6 @@ class LogUniformSampler(object):
self.range_max = range_max self.range_max = range_max
log_indices = torch.arange(1., range_max+2., 1.).log_() log_indices = torch.arange(1., range_max+2., 1.).log_()
self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# print('P', self.dist.numpy().tolist()[-30:])
self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
...@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler): ...@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
logits = torch.cat([true_logits[:, :, None], sample_logits], -1) logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
return logits return logits
# class LogUniformSampler(object):
# def __init__(self, range_max, unique=False):
# """
# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
# """
# self.range_max = range_max
# log_indices = torch.arange(1., range_max+2., 1.).log_()
# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# self.unique = unique
# if self.unique:
# self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
# def sample(self, n_sample, labels):
# pos_sample, new_labels = labels.unique(return_inverse=True)
# n_pos_sample = pos_sample.size(0)
# n_neg_sample = n_sample - n_pos_sample
# if self.unique:
# self.exclude_mask.index_fill_(0, pos_sample, 1)
# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
# self.exclude_mask.index_fill_(0, pos_sample, 0)
# else:
# sample_dist = self.dist
# neg_sample = torch.multinomial(sample_dist, n_neg_sample)
# sample = torch.cat([pos_sample, neg_sample])
# sample_prob = self.dist[sample]
# return new_labels, sample, sample_prob
if __name__ == '__main__':
S, B = 3, 4
n_vocab = 10000
n_sample = 5
H = 32
labels = torch.LongTensor(S, B).random_(0, n_vocab)
# sampler = LogUniformSampler(n_vocab, unique=False)
# new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
# true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
# print('true_probs', true_probs.numpy().tolist())
# print('samp_probs', samp_probs.numpy().tolist())
# print('neg_samples', neg_samples.numpy().tolist())
# print('sum', torch.sum(sampler.dist).item())
# assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
embedding = nn.Embedding(n_vocab, H)
bias = torch.zeros(n_vocab)
inputs = torch.Tensor(S, B, H).normal_()
logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)
print('logits', logits.detach().numpy().tolist())
print('logits shape', logits.size())
print('out_labels', out_labels.detach().numpy().tolist())
print('out_labels shape', out_labels.size())
@@ -57,16 +57,18 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
                     . `xlnet-large-cased`
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
+                - a path or url to a directory containing a configuration file `config.json` for the model,
+                - a path or url to a configuration file for the model.
             cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
         """
         cache_dir = kwargs.pop('cache_dir', None)
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
-        else:
+        elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        else:
+            config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
@@ -102,7 +104,7 @@ class PretrainedConfig(object):
         for key in to_remove:
             kwargs.pop(key, None)
-        logger.info("Model config {}".format(config))
+        logger.info("Model config %s", config)
         return config

     @classmethod
@@ -200,6 +202,7 @@ class PreTrainedModel(nn.Module):
                 - a path or url to a tensorflow pretrained model checkpoint containing:
                     . `config.json` a configuration file for the model
                     . `model.chkpt` a TensorFlow checkpoint
+            config: an optional configuration for the model
             from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionnary (collections.OrderedDict object) to use
@@ -207,23 +210,31 @@ class PreTrainedModel(nn.Module):
             *inputs, **kwargs: additional input for the specific XLNet class
                 (ex: num_labels for XLNetForSequenceClassification)
         """
+        config = kwargs.pop('config', None)
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
         from_tf = kwargs.pop('from_tf', False)
         output_loading_info = kwargs.pop('output_loading_info', False)

         # Load config
-        config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        if config is None:
+            config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

         # Load model
         if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
             archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-        else:
+        elif os.path.isdir(pretrained_model_name_or_path):
             if from_tf:
                 # Directly load from a TensorFlow checkpoint
                 archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
             else:
                 archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
......
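With the changes above, `from_pretrained()` accepts a shortcut name, a local directory, or a direct file path, and can optionally take a pre-built configuration through the new `config` keyword. A hedged sketch of the three call styles (XLNet classes are used as examples; any `PreTrainedModel` subclass resolves the same way):

```python
from pytorch_transformers import XLNetConfig, XLNetModel

# 1) Shortcut name: resolved through the pretrained archive maps.
model = XLNetModel.from_pretrained('xlnet-large-cased')

# 2) Local directory containing config.json and the saved weights (e.g. a fine-tuning output_dir).
model = XLNetModel.from_pretrained('./proc_data/sts-b-110')

# 3) Explicit configuration object passed through the new `config` keyword.
config = XLNetConfig.from_pretrained('xlnet-large-cased')
model = XLNetModel.from_pretrained('xlnet-large-cased', config=config)
```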
...@@ -122,14 +122,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path): ...@@ -122,14 +122,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
except ImportError: except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.") "https://www.tensorflow.org/install/ for installation instructions.")
raise raise
# Load weights from TF model # Load weights from TF model
init_vars = tf.train.list_variables(tf_path) init_vars = tf.train.list_variables(tf_path)
tf_weights = {} tf_weights = {}
for name, shape in init_vars: for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape)) logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name) array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array tf_weights[name] = array
...@@ -137,15 +137,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path): ...@@ -137,15 +137,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights) tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
for name, pointer in tf_to_pt_map.items(): for name, pointer in tf_to_pt_map.items():
print("Importing {}".format(name)) logger.info("Importing {}".format(name))
if name not in tf_weights: if name not in tf_weights:
print("{} not in tf pre-trained weights, skipping".format(name)) logger.info("{} not in tf pre-trained weights, skipping".format(name))
continue continue
array = tf_weights[name] array = tf_weights[name]
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model # which are not required for using pretrained model
if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name): if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
print("Transposing") logger.info("Transposing")
array = np.transpose(array) array = np.transpose(array)
if isinstance(pointer, list): if isinstance(pointer, list):
# Here we will split the TF weigths # Here we will split the TF weigths
...@@ -157,7 +157,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path): ...@@ -157,7 +157,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
except AssertionError as e: except AssertionError as e:
e.args += (p_i.shape, arr_i.shape) e.args += (p_i.shape, arr_i.shape)
raise raise
print("Initialize PyTorch weight {} for layer {}".format(name, i)) logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
p_i.data = torch.from_numpy(arr_i) p_i.data = torch.from_numpy(arr_i)
else: else:
try: try:
...@@ -165,13 +165,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path): ...@@ -165,13 +165,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
except AssertionError as e: except AssertionError as e:
e.args += (pointer.shape, array.shape) e.args += (pointer.shape, array.shape)
raise raise
print("Initialize PyTorch weight {}".format(name)) logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array) pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None) tf_weights.pop(name, None)
tf_weights.pop(name + '/Adam', None) tf_weights.pop(name + '/Adam', None)
tf_weights.pop(name + '/Adam_1', None) tf_weights.pop(name + '/Adam_1', None)
print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model return model
...@@ -211,10 +211,6 @@ class XLNetConfig(PretrainedConfig): ...@@ -211,10 +211,6 @@ class XLNetConfig(PretrainedConfig):
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
dropout=0.1, dropout=0.1,
dropatt=0.1,
init="normal",
init_range=0.1,
init_std=0.02,
mem_len=None, mem_len=None,
reuse_len=None, reuse_len=None,
bi_data=False, bi_data=False,
...@@ -258,11 +254,6 @@ class XLNetConfig(PretrainedConfig): ...@@ -258,11 +254,6 @@ class XLNetConfig(PretrainedConfig):
dropout: float, dropout rate. dropout: float, dropout rate.
dropatt: float, dropout rate on attention probabilities. dropatt: float, dropout rate on attention probabilities.
init: str, the initialization scheme, either "normal" or "uniform".
init_range: float, initialize the parameters with a uniform distribution
in [-init_range, init_range]. Only effective when init="uniform".
init_std: float, initialize the parameters with a normal distribution
with mean 0 and stddev init_std. Only effective when init="normal".
mem_len: int, the number of tokens to cache. mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the currect batch to be cached reuse_len: int, the number of tokens in the currect batch to be cached
and reused in the future. and reused in the future.
...@@ -297,11 +288,7 @@ class XLNetConfig(PretrainedConfig): ...@@ -297,11 +288,7 @@ class XLNetConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.init = init
self.init_range = init_range
self.init_std = init_std
self.dropout = dropout self.dropout = dropout
self.dropatt = dropatt
self.mem_len = mem_len self.mem_len = mem_len
self.reuse_len = reuse_len self.reuse_len = reuse_len
self.bi_data = bi_data self.bi_data = bi_data
...@@ -393,7 +380,7 @@ class XLNetRelativeAttention(nn.Module): ...@@ -393,7 +380,7 @@ class XLNetRelativeAttention(nn.Module):
x = x[1:, ...] x = x[1:, ...]
x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3]) x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
# x = x[:, 0:klen, :, :] # x = x[:, 0:klen, :, :]
x = torch.index_select(x, 1, torch.arange(klen)) x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
return x return x
......
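The `torch.index_select` fix above matters because the index tensor must live on the same device as the input: a bare `torch.arange(klen)` is created on CPU and the call fails once the model runs on GPU. A tiny sketch of the pattern (the function name is illustrative):

```python
import torch

def select_first_klen(x, klen):
    # Build the index on x's device so the same code works on CPU and GPU.
    index = torch.arange(klen, device=x.device, dtype=torch.long)
    return torch.index_select(x, 1, index)
```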
@@ -14,174 +14,92 @@
 # limitations under the License.
 """PyTorch optimization for BERT model."""

+import logging
 import math
 import torch
 from torch.optim import Optimizer
-from torch.optim.optimizer import required
-from torch.nn.utils import clip_grad_norm_
-import logging
-import abc
-import sys

+from torch.optim.lr_scheduler import LambdaLR

 logger = logging.getLogger(__name__)

+class ConstantLRSchedule(LambdaLR):
+    def __init__(self, optimizer, last_epoch=-1):
+        super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)

-if sys.version_info >= (3, 4):
-    ABC = abc.ABC
-else:
-    ABC = abc.ABCMeta('ABC', (), {})

-class _LRSchedule(ABC):
-    """ Parent of all LRSchedules here. """
-    warn_t_total = False  # is set to True for schedules where progressing beyond t_total steps doesn't make sense
-    def __init__(self, warmup=0.002, t_total=-1, **kw):
-        """
-        :param warmup: what fraction of t_total steps will be used for linear warmup
-        :param t_total: how many training steps (updates) are planned
-        :param kw:
-        """
-        super(_LRSchedule, self).__init__(**kw)
-        if t_total < 0:
-            logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
-        warmup = max(warmup, 0.)
-        self.warmup, self.t_total = float(warmup), float(t_total)
-        self.warned_for_t_total_at_progress = -1
-    def get_lr(self, step, nowarn=False):
-        """
-        :param step: which of t_total steps we're on
-        :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
-        :return: learning rate multiplier for current update
-        """
-        if self.t_total < 0:
-            return 1.
-        progress = float(step) / self.t_total
-        ret = self.get_lr_(progress)
-        # warning for exceeding t_total (only active with warmup_linear
-        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
-            logger.warning(
-                "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
-                .format(ret, self.__class__.__name__))
-            self.warned_for_t_total_at_progress = progress
-        # end warning
-        return ret
-    @abc.abstractmethod
-    def get_lr_(self, progress):
-        """
-        :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
-        :return: learning rate multiplier for current update
-        """
-        return 1.

-class ConstantLR(_LRSchedule):
-    def get_lr_(self, progress):
-        return 1.

-class WarmupCosineSchedule(_LRSchedule):
+class WarmupCosineSchedule(LambdaLR):
     """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
+    Linearly increases learning rate from 0 to 1 over `warmup` training steps.
+    Decreases learning rate from 1. to 0. over remaining `t_total - warmup` steps following a cosine curve.
     If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    :param warmup: see LRSchedule
-    :param t_total: see LRSchedule
-    :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
-    :param kw:
     """
     warn_t_total = True
-    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
-        """
-        :param warmup: see LRSchedule
-        :param t_total: see LRSchedule
-        :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
-        :param kw:
-        """
-        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
-        self.cycles = cycles
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)  # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / max(1, warmup_steps)
+            else:
+                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)  # progress after warmup
+                return 0.5 * (1. + math.cos(math.pi * cycles * 2 * progress))
+        super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)

-class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
+class WarmupCosineWithHardRestartsSchedule(LambdaLR):
     """
     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
     learning rate (with hard restarts).
     """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-        assert(cycles >= 1.)
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)  # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
-            return ret
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / max(1, warmup_steps)
+            else:
+                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)  # progress after warmup
+                ret = 0.5 * (1. + math.cos(math.pi * ((cycles * progress) % 1)))
+                return ret
+        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)

-class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
-    """
-    All training progress is divided in `cycles` (default=1.) parts of equal length.
-    Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
-    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-    """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        assert(warmup * cycles < 1.)
-        warmup = warmup * cycles if warmup >= 0 else warmup
-        super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-    def get_lr_(self, progress):
-        progress = progress * self.cycles % 1.
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)  # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * progress))
-            return ret

-class WarmupConstantSchedule(_LRSchedule):
+class WarmupConstantSchedule(LambdaLR):
     """
     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Keeps learning rate equal to 1. after warmup.
     """
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return 1.
+    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / warmup_steps
+            return 1.
+        super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)

-class WarmupLinearSchedule(_LRSchedule):
+class WarmupLinearSchedule(LambdaLR):
     """
     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
     """
-    warn_t_total = True
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return max((progress - 1.) / (self.warmup - 1.), 0.)
+    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / max(1, warmup_steps)
+            return (t_total - step) / max(1, t_total - warmup_steps)
+        super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)

-SCHEDULES = {
-    None:       ConstantLR,
-    "none":     ConstantLR,
-    "warmup_cosine":   WarmupCosineSchedule,
-    "warmup_constant": WarmupConstantSchedule,
-    "warmup_linear":   WarmupLinearSchedule
-}

-class BertAdam(Optimizer):
-    """Implements BERT version of Adam algorithm with weight decay fix.
+class AdamW(Optimizer):
+    """ Implements Adam algorithm with weight decay fix.
     Parameters:
         lr: learning rate
@@ -197,43 +115,20 @@ class BertAdam(Optimizer):
         e: Adams epsilon. Default: 1e-6
         weight_decay: Weight decay. Default: 0.01
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
+        correct_bias: can be set to False to avoid correcting bias in Adam (e.g. like in Bert repository)
     """
-    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
-        if lr is not required and lr < 0.0:
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01, correct_bias=True):
+        if lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
-        if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
-        if not e >= 0.0:
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1] ))
+        if not 0.0 <= eps:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        # initialize schedule object
-        if not isinstance(schedule, _LRSchedule):
-            schedule_type = SCHEDULES[schedule]
-            schedule = schedule_type(warmup=warmup, t_total=t_total)
-        else:
-            if warmup != -1 or t_total != -1:
-                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
-                               "Please specify custom warmup and t_total in _LRSchedule object.")
-        defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
-                        max_grad_norm=max_grad_norm)
-        super(BertAdam, self).__init__(params, defaults)
-    def get_lr(self):
-        lr = []
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                if len(state) == 0:
-                    return [0]
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                lr.append(lr_scheduled)
-        return lr
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
+                        correct_bias=correct_bias)
+        super(AdamW, self).__init__(params, defaults)

     def step(self, closure=None):
         """Performs a single optimization step.
@@ -260,22 +155,28 @@ class BertAdam(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['next_m'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p.data)
                     # Exponential moving average of squared gradient values
-                    state['next_v'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)

-                next_m, next_v = state['next_m'], state['next_v']
-                beta1, beta2 = group['b1'], group['b2']
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+                state['step'] += 1

-                # Add grad clipping
-                if group['max_grad_norm'] > 0:
-                    clip_grad_norm_(p, group['max_grad_norm'])

                 # Decay the first and second moment running average coefficient
                 # In-place operations to update the averages at the same time
-                next_m.mul_(beta1).add_(1 - beta1, grad)
-                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                update = next_m / (next_v.sqrt() + group['e'])
+                exp_avg.mul_(beta1).add_(1 - beta1, grad)
+                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                denom = exp_avg_sq.sqrt().add_(group['eps'])
+                step_size = group['lr']
+                if group['correct_bias']:  # No bias correction for Bert
+                    bias_correction1 = 1 - beta1 ** state['step']
+                    bias_correction2 = 1 - beta2 ** state['step']
+                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
+                p.data.addcdiv_(-step_size, exp_avg, denom)

                 # Just adding the square of the weights to the loss function is *not*
                 # the correct way of using L2 regularization/weight decay with Adam,
@@ -284,20 +185,8 @@ class BertAdam(Optimizer):
                 # Instead we want to decay the weights in a manner that doesn't interact
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
-                if group['weight_decay'] > 0.0:
-                    update += group['weight_decay'] * p.data
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                update_with_lr = lr_scheduled * update
-                p.data.add_(-update_with_lr)
-                state['step'] += 1
-                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-                # No bias correction
-                # bias_correction1 = 1 - beta1 ** state['step']
-                # bias_correction2 = 1 - beta2 ** state['step']
+                # Add weight decay at the end (fixed version)
+                if group['weight_decay'] > 0:
+                    p.data.add_(-group['lr'] * group['weight_decay'], p.data)

         return loss
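Taken together, the new `AdamW` optimizer and the `LambdaLR`-based schedules replace `BertAdam`/`OpenAIAdam`: the learning-rate schedule is now a separate object that is stepped explicitly, and gradient clipping is done by the caller rather than inside the optimizer. A minimal training-loop sketch under these assumptions (the model and the step counts are placeholders, mirroring `--warmup_steps`/`--max_steps` from the README example above):

```python
import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(10, 1)                       # placeholder for a real transformer model
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=120, t_total=1200)

for step in range(1200):
    loss = model(torch.randn(8, 10)).pow(2).mean()   # dummy loss for illustration
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clipping no longer lives inside the optimizer
    optimizer.step()
    scheduler.step()                                 # schedules are stepped explicitly now
    optimizer.zero_grad()
```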
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for OpenAI GPT model."""
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
import logging
from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \
WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule
logger = logging.getLogger(__name__)
class OpenAIAdam(Optimizer):
"""Implements Open AI version of Adam algorithm with weight decay fix.
"""
def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
vector_l2=False, max_grad_norm=-1, **kwargs):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# initialize schedule object
if not isinstance(schedule, _LRSchedule):
schedule_type = SCHEDULES[schedule]
schedule = schedule_type(warmup=warmup, t_total=t_total)
else:
if warmup != -1 or t_total != -1:
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
"Please specify custom warmup and t_total in _LRSchedule object.")
defaults = dict(lr=lr, schedule=schedule,
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
max_grad_norm=max_grad_norm)
super(OpenAIAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
lr.append(lr_scheduled)
return lr
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['b1'], group['b2']
state['step'] += 1
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['e'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
lr_scheduled = group['lr']
lr_scheduled *= group['schedule'].get_lr(state['step'])
step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
# Add weight decay at the end (fixed version)
if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
return loss
@@ -20,10 +20,9 @@ import unittest
 import torch

-from pytorch_transformers import BertAdam
-from pytorch_transformers import OpenAIAdam
-from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
-    WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
+from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
+                                  WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)

 import numpy as np
@@ -34,12 +33,12 @@ class OptimizationTest(unittest.TestCase):
         for a, b in zip(list1, list2):
             self.assertAlmostEqual(a, b, delta=tol)

-    def test_adam(self):
+    def test_adam_w(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
         criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
-        optimizer = BertAdam(params=[w], lr=2e-1,
+        optimizer = AdamW(params=[w], lr=2e-1,
                              weight_decay=0.0,
                              max_grad_norm=-1)
         for _ in range(100):
@@ -52,23 +51,13 @@ class OptimizationTest(unittest.TestCase):
 class ScheduleInitTest(unittest.TestCase):
-    def test_bert_sched_init(self):
-        m = torch.nn.Linear(50, 50)
-        optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
-        # shouldn't fail
-    def test_openai_sched_init(self):
+    def test_sched_init(self):
         m = torch.nn.Linear(50, 50)
-        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
+        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
         self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
+        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
         self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
+        optim = AdamW(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
         self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
         # shouldn't fail
......
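In the same spirit as the updated test above, the new schedules can be checked directly by stepping them and watching the learning rate in the optimizer's param group. A hedged sketch (the numbers are arbitrary, not taken from the test file):

```python
import torch
from pytorch_transformers import AdamW, WarmupConstantSchedule

m = torch.nn.Linear(50, 50)
optim = AdamW(m.parameters(), lr=1e-3)
sched = WarmupConstantSchedule(optim, warmup_steps=10)

lrs = []
for _ in range(20):
    optim.step()          # no gradients here, so this is a no-op; it just keeps the usual call order
    sched.step()
    lrs.append(optim.param_groups[0]['lr'])

assert lrs[0] < lrs[5]              # learning rate ramps up during warmup
assert abs(lrs[-1] - 1e-3) < 1e-8   # and stays at the base lr afterwards
```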
...@@ -98,14 +98,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -98,14 +98,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.build_vocab() self.build_vocab()
def count_file(self, path, verbose=False, add_eos=False): def count_file(self, path, verbose=False, add_eos=False):
if verbose: print('counting file {} ...'.format(path)) if verbose: logger.info('counting file {} ...'.format(path))
assert os.path.exists(path) assert os.path.exists(path)
sents = [] sents = []
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f): for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0: if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx)) logger.info(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos) symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols) self.counter.update(symbols)
sents.append(symbols) sents.append(symbols)
...@@ -116,10 +116,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -116,10 +116,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
""" """
sents : a list of sentences, each a list of tokenized symbols sents : a list of sentences, each a list of tokenized symbols
""" """
if verbose: print('counting {} sents ...'.format(len(sents))) if verbose: logger.info('counting {} sents ...'.format(len(sents)))
for idx, symbols in enumerate(sents): for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0: if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx)) logger.info(' line {}'.format(idx))
self.counter.update(symbols) self.counter.update(symbols)
def _build_from_file(self, vocab_file): def _build_from_file(self, vocab_file):
...@@ -147,11 +147,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -147,11 +147,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
def build_vocab(self): def build_vocab(self):
if self.vocab_file: if self.vocab_file:
print('building vocab from {}'.format(self.vocab_file)) logger.info('building vocab from {}'.format(self.vocab_file))
self._build_from_file(self.vocab_file) self._build_from_file(self.vocab_file)
print('final vocab size {}'.format(len(self))) logger.info('final vocab size {}'.format(len(self)))
else: else:
print('building vocab with min_freq={}, max_size={}'.format( logger.info('building vocab with min_freq={}, max_size={}'.format(
self.min_freq, self.max_size)) self.min_freq, self.max_size))
self.idx2sym = [] self.idx2sym = []
self.sym2idx = OrderedDict() self.sym2idx = OrderedDict()
...@@ -163,18 +163,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -163,18 +163,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if cnt < self.min_freq: break if cnt < self.min_freq: break
self.add_symbol(sym) self.add_symbol(sym)
print('final vocab size {} from {} unique tokens'.format( logger.info('final vocab size {} from {} unique tokens'.format(
len(self), len(self.counter))) len(self), len(self.counter)))
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
add_double_eos=False): add_double_eos=False):
if verbose: print('encoding file {} ...'.format(path)) if verbose: logger.info('encoding file {} ...'.format(path))
assert os.path.exists(path) assert os.path.exists(path)
encoded = [] encoded = []
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f): for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0: if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx)) logger.info(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos, symbols = self.tokenize(line, add_eos=add_eos,
add_double_eos=add_double_eos) add_double_eos=add_double_eos)
encoded.append(self.convert_to_tensor(symbols)) encoded.append(self.convert_to_tensor(symbols))
...@@ -185,11 +185,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -185,11 +185,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return encoded return encoded
def encode_sents(self, sents, ordered=False, verbose=False): def encode_sents(self, sents, ordered=False, verbose=False):
if verbose: print('encoding {} sents ...'.format(len(sents))) if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
encoded = [] encoded = []
for idx, symbols in enumerate(sents): for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0: if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx)) logger.info(' line {}'.format(idx))
encoded.append(self.convert_to_tensor(symbols)) encoded.append(self.convert_to_tensor(symbols))
if ordered: if ordered:
...@@ -218,7 +218,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -218,7 +218,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if sym in self.sym2idx: if sym in self.sym2idx:
return self.sym2idx[sym] return self.sym2idx[sym]
else: else:
# print('encounter unk {}'.format(sym)) # logger.info('encounter unk {}'.format(sym))
# assert '<eos>' not in sym # assert '<eos>' not in sym
if hasattr(self, 'unk_idx'): if hasattr(self, 'unk_idx'):
return self.sym2idx.get(sym, self.unk_idx) return self.sym2idx.get(sym, self.unk_idx)
...@@ -544,14 +544,14 @@ def get_lm_corpus(datadir, dataset): ...@@ -544,14 +544,14 @@ def get_lm_corpus(datadir, dataset):
fn = os.path.join(datadir, 'cache.pt') fn = os.path.join(datadir, 'cache.pt')
fn_pickle = os.path.join(datadir, 'cache.pkl') fn_pickle = os.path.join(datadir, 'cache.pkl')
if os.path.exists(fn): if os.path.exists(fn):
print('Loading cached dataset...') logger.info('Loading cached dataset...')
corpus = torch.load(fn_pickle) corpus = torch.load(fn_pickle)
elif os.path.exists(fn): elif os.path.exists(fn):
print('Loading cached dataset from pickle...') logger.info('Loading cached dataset from pickle...')
with open(fn, "rb") as fp: with open(fn, "rb") as fp:
corpus = pickle.load(fp) corpus = pickle.load(fp)
else: else:
print('Producing dataset {}...'.format(dataset)) logger.info('Producing dataset {}...'.format(dataset))
kwargs = {} kwargs = {}
if dataset in ['wt103', 'wt2']: if dataset in ['wt103', 'wt2']:
kwargs['special'] = ['<eos>'] kwargs['special'] = ['<eos>']
......
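The tokenizer's verbose progress reports above also go through `logger` now. For orientation, a rough sketch of how these file helpers fit together; the constructor arguments are assumptions (only `count_file`, `build_vocab` and `encode_file` appear in this diff):

```python
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer

# Assumed constructor arguments: a Transfo-XL style word-level vocabulary built from a corpus file.
tokenizer = TransfoXLTokenizer(special=['<eos>'], lower_case=False)
tokenizer.count_file('train.txt', verbose=True, add_eos=True)   # fills tokenizer.counter
tokenizer.build_vocab()                                          # uses min_freq / max_size as shown above
ids = tokenizer.encode_file('valid.txt', ordered=True, verbose=True, add_eos=True)
```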