"...resnet50_tensorflow.git" did not exist on "98965cc42ca630024e402582e109c2d534cf73f2"
Unverified commit 3c1b6f59, authored by Julien Chaumond and committed by GitHub

Merge branch 'master' into fix_top_k_top_p_filtering

parents a9f24a16 fa735208
@@ -14,14 +14,15 @@
 # limitations under the License.
 """
 Preprocessing script before training DistilBERT.
+Specific to BERT -> DistilBERT.
 """
-from pytorch_transformers import BertForMaskedLM, RobertaForMaskedLM
+from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
 import argparse
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
-    parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"])
+    parser.add_argument("--model_type", default="bert", choices=["bert"])
     parser.add_argument("--model_name", default='bert-base-uncased', type=str)
     parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
     parser.add_argument("--vocab_transform", action='store_true')
@@ -31,9 +32,8 @@ if __name__ == '__main__':
     if args.model_type == 'bert':
         model = BertForMaskedLM.from_pretrained(args.model_name)
         prefix = 'bert'
-    elif args.model_type == 'roberta':
-        model = RobertaForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'roberta'
+    else:
+        raise ValueError(f'args.model_type should be "bert".')
 
     state_dict = model.state_dict()
     compressed_sd = {}
@@ -68,20 +68,12 @@ if __name__ == '__main__':
             state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
         std_idx += 1
 
-    if args.model_type == 'bert':
-        compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-        compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
-        if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
-                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
-    elif args.model_type == 'roberta':
-        compressed_sd[f'vocab_projector.weight'] = state_dict[f'lm_head.decoder.weight']
-        compressed_sd[f'vocab_projector.bias'] = state_dict[f'lm_head.bias']
-        if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'lm_head.dense.{w}']
-                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
+    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
+    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+    if args.vocab_transform:
+        for w in ['weight', 'bias']:
+            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
+            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
 
     print(f'N layers selected for distillation: {std_idx}')
     print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
...
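The `teacher_idx` / `std_idx` bookkeeping visible at the top of the last hunk belongs to a layer-selection loop that the diff does not show in full. A minimal, self-contained sketch of the idea follows; the layer subset [0, 2, 4, 7, 9, 11] and the single weight key shown are illustrative assumptions, not the verbatim script.

# Sketch of teacher -> student layer selection; subset and key are assumptions.
state_dict = {f'bert.encoder.layer.{i}.attention.self.query.weight': i
              for i in range(12)}          # stand-in for model.state_dict()
prefix = 'bert'
compressed_sd = {}
std_idx = 0
for teacher_idx in [0, 2, 4, 7, 9, 11]:    # assumed subset of the 12 teacher layers
    # teacher layer `teacher_idx` is renumbered as student layer `std_idx`
    compressed_sd[f'transformer.layer.{std_idx}.attention.q_lin.weight'] = \
        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.weight']
    std_idx += 1
print(f'N layers selected for distillation: {std_idx}')   # -> 6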
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before training the distilled model.
 """
 from collections import Counter
 import argparse
...
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 30522
}
\ No newline at end of file
{
"initializer_range": 0.02,
"layer_norm_epsilon": 0.00001,
"n_ctx": 1024,
"n_embd": 768,
"n_head": 12,
"n_layer": 6,
"n_positions": 1024,
"vocab_size": 50257
}
\ No newline at end of file
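The two JSON files added above are student configurations: the first matches DistilBERT's config schema (n_layers, sinusoidal_pos_embds, ...) and the second GPT-2's (n_layer, n_ctx, ...). A minimal sketch of loading them, assuming the transformers config classes and hypothetical file names:

from transformers import DistilBertConfig, GPT2Config

# File names are hypothetical; from_json_file is the generic
# PretrainedConfig constructor.
student_bert = DistilBertConfig.from_json_file('distilbert_config.json')
student_gpt2 = GPT2Config.from_json_file('distilgpt2_config.json')
print(student_bert.n_layers, student_gpt2.n_layer)   # 6 6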
 tensorboardX
-scikit-learn
\ No newline at end of file
+tensorboard
+scikit-learn
+seqeval
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from pytorch_transformers import (WEIGHTS_NAME,
+from transformers import (WEIGHTS_NAME,
                           BertConfig, BertForSequenceClassification, BertTokenizer,
                           XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                           XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
...
@@ -29,16 +29,21 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-from tensorboardX import SummaryWriter
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except:
+    from tensorboardX import SummaryWriter
+
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                   BertForMultipleChoice, BertTokenizer,
                                   XLNetConfig, XLNetForMultipleChoice,
                                   XLNetTokenizer, RobertaConfig,
                                   RobertaForMultipleChoice, RobertaTokenizer)
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 
 from utils_multiple_choice import (convert_examples_to_features, processors)
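The try/except added above prefers the SummaryWriter that ships with PyTorch (torch >= 1.1) and falls back to the external tensorboardX package; both expose the same interface. A minimal usage sketch with an assumed log directory:

# Works with either import; 'runs/example' is an illustrative log dir.
tb_writer = SummaryWriter(log_dir='runs/example')
tb_writer.add_scalar('train/loss', 0.42, global_step=1)   # one point on a scalar curve
tb_writer.close()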
@@ -141,7 +146,7 @@ def train(args, train_dataset, model, tokenizer):
                       'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                       'labels': batch[3]}
             outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean()  # mean() to average on multi-gpu parallel training
@@ -293,7 +298,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
         list(filter(None, args.model_name_or_path.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
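The `and not args.overwrite_cache` guard added above lets a stale feature cache be rebuilt via a flag instead of deleting the file by hand. The pattern reduced to a self-contained sketch (file name and flag wiring are assumptions):

import argparse, os, torch

parser = argparse.ArgumentParser()
parser.add_argument('--overwrite_cache', action='store_true')   # assumed wiring
args = parser.parse_args([])

cached_features_file = 'cached_features.pt'                     # illustrative name
if os.path.exists(cached_features_file) and not args.overwrite_cache:
    features = torch.load(cached_features_file)                 # reuse the cache
else:
    features = ['feat1', 'feat2']                               # stand-in for featurization
    torch.save(features, cached_features_file)                  # (re)write the cache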
@@ -306,14 +311,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
         else:
             examples = processor.get_train_examples(args.data_dir)
         logger.info("Training number: %s", str(len(examples)))
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
-                                                cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
-                                                cls_token=tokenizer.cls_token,
-                                                sep_token=tokenizer.sep_token,
-                                                sep_token_extra=bool(args.model_type in ['roberta']),
-                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
-                                                pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
-                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        features = convert_examples_to_features(
+            examples,
+            label_list,
+            args.max_seq_length,
+            tokenizer,
+            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+        )
     if args.local_rank in [-1, 0]:
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)
@@ -362,7 +367,7 @@ def main():
                         help="Whether to run eval on the dev set.")
     parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
     parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
+                        help="Run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
@@ -508,13 +513,15 @@ def main():
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
@@ -524,13 +531,15 @@ def main():
         checkpoints = [args.output_dir]
         # if args.eval_all_checkpoints: # can not use this to do test!!
         #     checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-        #     logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step, test=True)
+            result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
 
     if best_steps:
...
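The `prefix` lines added in the two hunks above change what `evaluate()` receives: the checkpoint directory name rather than the bare step number, and an empty string for the top-level output dir. A quick self-contained illustration with assumed paths:

checkpoints = ['out', 'out/checkpoint-500', 'out/checkpoint-1000']   # assumed layout
for checkpoint in checkpoints:
    global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
    prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
    print(repr(global_step), repr(prefix))
# 'out' ''                 <- top-level dir: no 'checkpoint' in the name
# '500' 'checkpoint-500'
# '1000' 'checkpoint-1000'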
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
 
-from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
 from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
...
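The hunk above is purely the pytorch_transformers -> transformers package rename; BasicTokenizer and whitespace_tokenize are unchanged. A small usage sketch (the printed outputs are my expectation, not from the source):

from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize

basic = BasicTokenizer(do_lower_case=True)
print(basic.tokenize("Hello, World!"))    # expected: ['hello', ',', 'world', '!']
print(whitespace_tokenize(" a b  c "))    # expected: ['a', 'b', 'c']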