Commit 4fef5919 authored by thomwolf

updating examples

parent 50b7e52a
@@ -18,46 +18,37 @@
 from __future__ import absolute_import, division, print_function

 import argparse
+import glob
 import logging
 import os
 import random
-from tqdm import tqdm, trange

 import numpy as np
 import torch
+from tensorboardX import SummaryWriter
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-from tensorboardX import SummaryWriter
+from tqdm import tqdm, trange

-from pytorch_transformers import (BertForSequenceClassification, XLNetForSequenceClassification,
-                                  XLMForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
-                                  XLMTokenizer)
+from pytorch_transformers import WEIGHTS_NAME
+from pytorch_transformers import (BertConfig, BertForSequenceClassification,
+                                  BertTokenizer, XLMConfig,
+                                  XLMForSequenceClassification, XLMTokenizer,
+                                  XLNetConfig, XLNetForSequenceClassification,
+                                  XLNetTokenizer)

 from pytorch_transformers.optimization import BertAdam

-from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                            XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ())

 MODEL_CLASSES = {
-    'bert': BertForSequenceClassification,
-    'xlnet': XLNetForSequenceClassification,
-    'xlm': XLMForSequenceClassification,
-}
-
-TOKENIZER_CLASSES = {
-    'bert': BertTokenizer,
-    'xlnet': XLNetTokenizer,
-    'xlm': XLMTokenizer,
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
 }

 def train(args, train_dataset, model, tokenizer):
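
Note on the refactor above: the two separate lookup tables are merged into a single mapping from model type to a (config, model, tokenizer) triple. A minimal sketch of how the new triples are consumed, assuming the imports above; the checkpoint name and num_labels are illustrative:

    config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
    config = config_class.from_pretrained('bert-base-uncased', num_labels=2)  # illustrative checkpoint
    tokenizer = tokenizer_class.from_pretrained('bert-base-uncased', do_lower_case=True)
    model = model_class.from_pretrained('bert-base-uncased', config=config)
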
@@ -130,14 +121,26 @@ def train(args, train_dataset, model, tokenizer):
                 optimizer.step()
                 optimizer.zero_grad()
                 global_step += 1

                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
                     if args.local_rank == -1:  # Only evaluate on single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
+                        results = evaluate(args, model, tokenizer, prefix=global_step)
                         for key, value in results.items():
                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                     tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                     logging_loss = tr_loss

+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+
             if args.max_steps > 0 and global_step > args.max_steps:
                 break
         if args.max_steps > 0 and global_step > args.max_steps:
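
The new save_steps branch writes rolling checkpoints under output_dir via save_pretrained. A checkpoint saved this way can be restored later; a sketch, assuming the names above (the step number 500 is hypothetical):

    checkpoint_dir = os.path.join(args.output_dir, 'checkpoint-500')  # hypothetical step
    model = model_class.from_pretrained(checkpoint_dir)
    training_args = torch.load(os.path.join(checkpoint_dir, 'training_args.bin'))
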
@@ -146,7 +149,7 @@ def train(args, train_dataset, model, tokenizer):

     return global_step, tr_loss / global_step

-def evaluate(args, model, tokenizer):
+def evaluate(args, model, tokenizer, prefix=""):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
     eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
@@ -202,7 +205,7 @@ def evaluate(args, model, tokenizer):
         output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
+            logger.info("***** Eval results {} *****".format(prefix))
             for key in sorted(result.keys()):
                 logger.info("  %s = %s", key, str(result[key]))
                 writer.write("%s = %s\n" % (key, str(result[key])))
@@ -264,6 +267,10 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")

     ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
     parser.add_argument("--cache_dir", default="", type=str,
                         help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--max_seq_length", default=128, type=int,
@@ -293,8 +300,12 @@ def main():
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")

-    parser.add_argument('--logging_steps', type=int, default=100,
+    parser.add_argument('--logging_steps', type=int, default=50,
                         help="Log every X update steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save a checkpoint every X update steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
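
With the new flags, a run of the example script might look like the following. This is a hypothetical invocation: --data_dir, --do_train, and the MRPC paths are assumed from the usual GLUE example setup rather than shown in this diff.

    python run_glue.py \
        --model_name bert-base-uncased \
        --task_name MRPC \
        --data_dir ./glue_data/MRPC \
        --output_dir ./mrpc_output \
        --do_train --do_eval --do_lower_case \
        --logging_steps 50 --save_steps 50 \
        --eval_all_checkpoints
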
@@ -363,11 +374,15 @@ def main():
         # Make sure only the first process in distributed training will download model & vocab
         torch.distributed.barrier()

-    args.model_type = args.model_name.lower().split('-')[0]
-    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)

     if args.local_rank == 0:
         torch.distributed.barrier()
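
The model type is now inferred by substring match against the MODEL_CLASSES keys instead of splitting the name on '-', so names like 'xlnet-large-cased' resolve correctly. A sketch of the same logic (checkpoint names illustrative):

    for name in ('bert-base-uncased', 'xlnet-large-cased', 'xlm-mlm-en-2048'):
        model_type = next((key for key in MODEL_CLASSES if key in name.lower()), "")
        print(name, '->', model_type)  # -> bert, xlnet, xlm
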
@@ -410,8 +425,17 @@ def main():

     # Evaluation
     if args.do_eval and args.local_rank in [-1, 0]:
-        results = evaluate(args, model, tokenizer)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        results = {}
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1]
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)

     return results
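
Since each checkpoint's metrics are suffixed with the step they came from, the aggregated dictionary keeps them apart; a hypothetical shape after evaluating two checkpoints:

    # Purely illustrative values.
    results = {'acc_500': 0.83, 'f1_500': 0.88,
               'acc_1000': 0.86, 'f1_1000': 0.91}
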
......
@@ -21,6 +21,7 @@ import csv
 import logging
 import os
 import sys
+from io import open

 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
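
The added `from io import open` gives Python 2 the Python 3 open() signature (notably the encoding keyword), so the file-reading code in utils_glue behaves identically under both interpreters. A sketch with a hypothetical file:

    from io import open
    with open('dev.tsv', 'r', encoding='utf-8') as f:  # hypothetical TSV
        for line in f:
            pass  # line is decoded text on both Python 2 and 3
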
......
@@ -73,17 +73,17 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
-              "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+                     "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -93,7 +93,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            logger.info("Skipping {}".format("/".join(name)))
             continue
         pointer = model
         for m_name in name:
@@ -113,7 +113,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
             try:
                 pointer = getattr(pointer, l[0])
             except AttributeError:
-                print("Skipping {}".format("/".join(name)))
+                logger.info("Skipping {}".format("/".join(name)))
                 continue
             if len(l) >= 2:
                 num = int(l[1])
@@ -127,7 +127,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
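
These print() calls become logger.info()/logger.error() and therefore only appear once the application configures logging; a typical setup (not part of this diff) would be:

    import logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
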
......
@@ -49,17 +49,17 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
-              "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+                     "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     tf_path = os.path.abspath(gpt2_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array.squeeze())
@@ -90,7 +90,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
......
@@ -110,7 +110,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
......
@@ -126,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
-              "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+                     "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     # Build TF to PyTorch weights loading map
@@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
@@ -157,7 +157,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
             except AssertionError as e:
                 e.args += (p_i.shape, arr_i.shape)
                 raise
-            print("Initialize PyTorch weight {} for layer {}".format(name, i))
+            logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
             p_i.data = torch.from_numpy(arr_i)
         else:
             try:
@@ -165,13 +165,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
-            print("Initialize PyTorch weight {}".format(name))
+            logger.info("Initialize PyTorch weight {}".format(name))
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
         tf_weights.pop(name + '/Adam', None)
         tf_weights.pop(name + '/Adam_1', None)

-    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
     return model
......
@@ -272,7 +272,6 @@ class LogUniformSampler(object):
         self.range_max = range_max
         log_indices = torch.arange(1., range_max+2., 1.).log_()
         self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-        # print('P', self.dist.numpy().tolist()[-30:])

         self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
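
For reference, the proposal distribution built above follows `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` and is properly normalized; a quick standalone check (range_max arbitrary):

    import torch
    range_max = 1000
    log_indices = torch.arange(1., range_max + 2., 1.).log_()
    dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
    assert abs(dist.sum().item() - 1.0) < 1e-5  # increments of log(i) telescope to log(range_max + 1)
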
@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
     logits = torch.cat([true_logits[:, :, None], sample_logits], -1)

     return logits
-
-
-# class LogUniformSampler(object):
-#     def __init__(self, range_max, unique=False):
-#         """
-#         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
-#             `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
-#         """
-#         self.range_max = range_max
-#         log_indices = torch.arange(1., range_max+2., 1.).log_()
-#         self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-#         self.unique = unique
-#
-#         if self.unique:
-#             self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
-#
-#     def sample(self, n_sample, labels):
-#         pos_sample, new_labels = labels.unique(return_inverse=True)
-#         n_pos_sample = pos_sample.size(0)
-#         n_neg_sample = n_sample - n_pos_sample
-#
-#         if self.unique:
-#             self.exclude_mask.index_fill_(0, pos_sample, 1)
-#             sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
-#             self.exclude_mask.index_fill_(0, pos_sample, 0)
-#         else:
-#             sample_dist = self.dist
-#
-#         neg_sample = torch.multinomial(sample_dist, n_neg_sample)
-#         sample = torch.cat([pos_sample, neg_sample])
-#         sample_prob = self.dist[sample]
-#
-#         return new_labels, sample, sample_prob
-
-
-if __name__ == '__main__':
-    S, B = 3, 4
-    n_vocab = 10000
-    n_sample = 5
-    H = 32
-
-    labels = torch.LongTensor(S, B).random_(0, n_vocab)
-
-    # sampler = LogUniformSampler(n_vocab, unique=False)
-    # new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
-
-    sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
-    # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
-    # print('true_probs', true_probs.numpy().tolist())
-    # print('samp_probs', samp_probs.numpy().tolist())
-    # print('neg_samples', neg_samples.numpy().tolist())
-
-    # print('sum', torch.sum(sampler.dist).item())
-
-    # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
-
-    embedding = nn.Embedding(n_vocab, H)
-    bias = torch.zeros(n_vocab)
-    inputs = torch.Tensor(S, B, H).normal_()
-
-    logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)
-    print('logits', logits.detach().numpy().tolist())
-    print('logits shape', logits.size())
-    print('out_labels', out_labels.detach().numpy().tolist())
-    print('out_labels shape', out_labels.size())
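
The deleted __main__ block was a stale smoke test: it passed an extra n_sample argument and unpacked a second return value that sample_logits no longer provides. An equivalent standalone sketch against the current signature, with all sizes illustrative:

    import torch
    import torch.nn as nn

    S, B, H, n_vocab, n_sample = 3, 4, 32, 10000, 5
    labels = torch.LongTensor(S, B).random_(0, n_vocab)
    sampler = LogUniformSampler(n_vocab, n_sample)
    embedding = nn.Embedding(n_vocab, H)
    bias = torch.zeros(n_vocab)
    inputs = torch.Tensor(S, B, H).normal_()
    logits = sample_logits(embedding, bias, labels, inputs, sampler)
    print(logits.size())  # (S, B, 1 + n_sample); index 0 holds the true-class logit
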
@@ -57,16 +57,18 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
                     . `xlnet-large-cased`
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
+                - a path or url to a directory containing a configuration file `config.json` for the model,
+                - a path or url to a configuration file for the model.
             cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
         """
         cache_dir = kwargs.pop('cache_dir', None)
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
-        else:
+        elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        else:
+            config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
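
With the directory branch added, a configuration can now be resolved three ways; a sketch using BertConfig (the local paths are illustrative):

    config = BertConfig.from_pretrained('bert-base-uncased')            # shortcut name in the archive map
    config = BertConfig.from_pretrained('./my_model_dir')               # directory containing config.json
    config = BertConfig.from_pretrained('./my_model_dir/config.json')   # direct path or URL to the file
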
@@ -200,6 +202,7 @@ class PreTrainedModel(nn.Module):
                 - a path or url to a tensorflow pretrained model checkpoint containing:
                     . `config.json` a configuration file for the model
                     . `model.chkpt` a TensorFlow checkpoint
+            config: an optional configuration for the model
             from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionary (collections.OrderedDict object) to use
@@ -207,23 +210,31 @@ class PreTrainedModel(nn.Module):
             *inputs, **kwargs: additional input for the specific XLNet class
                 (ex: num_labels for XLNetForSequenceClassification)
         """
+        config = kwargs.pop('config', None)
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
         from_tf = kwargs.pop('from_tf', False)
         output_loading_info = kwargs.pop('output_loading_info', False)

         # Load config
-        config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        if config is None:
+            config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

         # Load model
         if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
             archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-        else:
+        elif os.path.isdir(pretrained_model_name_or_path):
             if from_tf:
                 # Directly load from a TensorFlow checkpoint
                 archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
             else:
                 archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = pretrained_model_name_or_path

         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
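
The new `config` keyword lets callers build or tweak a configuration once and hand it in, skipping the implicit config lookup inside from_pretrained; a sketch (names illustrative):

    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=3)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
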
......
@@ -122,14 +122,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
-              "https://www.tensorflow.org/install/ for installation instructions.")
+        logger.error("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+                     "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
@@ -137,15 +137,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
     tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)

     for name, pointer in tf_to_pt_map.items():
-        print("Importing {}".format(name))
+        logger.info("Importing {}".format(name))
         if name not in tf_weights:
-            print("{} not in tf pre-trained weights, skipping".format(name))
+            logger.info("{} not in tf pre-trained weights, skipping".format(name))
             continue
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
         # which are not required for using pretrained model
         if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
-            print("Transposing")
+            logger.info("Transposing")
             array = np.transpose(array)
         if isinstance(pointer, list):
             # Here we will split the TF weights
@@ -157,7 +157,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
             except AssertionError as e:
                 e.args += (p_i.shape, arr_i.shape)
                 raise
-            print("Initialize PyTorch weight {} for layer {}".format(name, i))
+            logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
             p_i.data = torch.from_numpy(arr_i)
         else:
             try:
@@ -165,13 +165,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
-            print("Initialize PyTorch weight {}".format(name))
+            logger.info("Initialize PyTorch weight {}".format(name))
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
         tf_weights.pop(name + '/Adam', None)
         tf_weights.pop(name + '/Adam_1', None)

-    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
     return model
......
@@ -98,14 +98,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         self.build_vocab()

     def count_file(self, path, verbose=False, add_eos=False):
-        if verbose: print('counting file {} ...'.format(path))
+        if verbose: logger.info('counting file {} ...'.format(path))
         assert os.path.exists(path)

         sents = []
         with open(path, 'r', encoding='utf-8') as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    print('    line {}'.format(idx))
+                    logger.info('    line {}'.format(idx))
                 symbols = self.tokenize(line, add_eos=add_eos)
                 self.counter.update(symbols)
                 sents.append(symbols)
@@ -116,10 +116,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         """
             sents : a list of sentences, each a list of tokenized symbols
         """
-        if verbose: print('counting {} sents ...'.format(len(sents)))
+        if verbose: logger.info('counting {} sents ...'.format(len(sents)))
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                print('    line {}'.format(idx))
+                logger.info('    line {}'.format(idx))
             self.counter.update(symbols)

     def _build_from_file(self, vocab_file):
@@ -147,11 +147,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):

     def build_vocab(self):
         if self.vocab_file:
-            print('building vocab from {}'.format(self.vocab_file))
+            logger.info('building vocab from {}'.format(self.vocab_file))
             self._build_from_file(self.vocab_file)
-            print('final vocab size {}'.format(len(self)))
+            logger.info('final vocab size {}'.format(len(self)))
         else:
-            print('building vocab with min_freq={}, max_size={}'.format(
-                self.min_freq, self.max_size))
+            logger.info('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
             self.idx2sym = []
             self.sym2idx = OrderedDict()
@@ -163,18 +163,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
                 if cnt < self.min_freq: break
                 self.add_symbol(sym)

-            print('final vocab size {} from {} unique tokens'.format(
-                len(self), len(self.counter)))
+            logger.info('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))

     def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
                     add_double_eos=False):
-        if verbose: print('encoding file {} ...'.format(path))
+        if verbose: logger.info('encoding file {} ...'.format(path))
         assert os.path.exists(path)
         encoded = []
         with open(path, 'r', encoding='utf-8') as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    print('    line {}'.format(idx))
+                    logger.info('    line {}'.format(idx))
                 symbols = self.tokenize(line, add_eos=add_eos,
                                         add_double_eos=add_double_eos)
                 encoded.append(self.convert_to_tensor(symbols))
@@ -185,11 +185,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return encoded

     def encode_sents(self, sents, ordered=False, verbose=False):
-        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
         encoded = []
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                print('    line {}'.format(idx))
+                logger.info('    line {}'.format(idx))
             encoded.append(self.convert_to_tensor(symbols))

         if ordered:
@@ -218,7 +218,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
-            # print('encounter unk {}'.format(sym))
+            # logger.info('encounter unk {}'.format(sym))
             # assert '<eos>' not in sym
             if hasattr(self, 'unk_idx'):
                 return self.sym2idx.get(sym, self.unk_idx)
@@ -544,14 +544,14 @@ def get_lm_corpus(datadir, dataset):
     fn = os.path.join(datadir, 'cache.pt')
     fn_pickle = os.path.join(datadir, 'cache.pkl')
     if os.path.exists(fn):
-        print('Loading cached dataset...')
+        logger.info('Loading cached dataset...')
         corpus = torch.load(fn_pickle)
     elif os.path.exists(fn):
-        print('Loading cached dataset from pickle...')
+        logger.info('Loading cached dataset from pickle...')
         with open(fn, "rb") as fp:
             corpus = pickle.load(fp)
     else:
-        print('Producing dataset {}...'.format(dataset))
+        logger.info('Producing dataset {}...'.format(dataset))
         kwargs = {}
         if dataset in ['wt103', 'wt2']:
             kwargs['special'] = ['<eos>']
......