Commit 1df6f262 authored by thomwolf


Merge branch 'fourth-release' of https://github.com/huggingface/pytorch-pretrained-BERT into fourth-release
parents 770f805a 632f2d2d
@@ -19,7 +19,7 @@ This implementation is provided with [Google's pre-trained models](https://githu
 ## Installation
-This repo was tested on Python 3.5+ and PyTorch 0.4.1
+This repo was tested on Python 3.6+ and PyTorch 0.4.1
 ### With pip
...
@@ -35,7 +35,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
 logger = logging.getLogger(__name__)
@@ -196,9 +196,7 @@ class ColaProcessor(DataProcessor):
 def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""
-    label_map = {}
-    for (i, label) in enumerate(label_list):
-        label_map[label] = i
+    label_map = {label : i for i, label in enumerate(label_list)}
     features = []
     for (ex_index, example) in enumerate(examples):
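The dict comprehension on the `+` line is a drop-in replacement for the removed loop. A minimal sanity check (the label values are just what a two-class processor would return; treat them as illustrative):

```python
label_list = ["0", "1"]  # illustrative labels for a binary task

# Removed form: build the mapping with an explicit loop.
label_map_loop = {}
for (i, label) in enumerate(label_list):
    label_map_loop[label] = i

# New form: equivalent dict comprehension.
label_map = {label: i for i, label in enumerate(label_list)}

assert label_map == label_map_loop == {"0": 0, "1": 1}
```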
@@ -207,8 +205,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         tokens_b = None
         if example.text_b:
             tokens_b = tokenizer.tokenize(example.text_b)
-        if tokens_b:
             # Modifies `tokens_a` and `tokens_b` in place so that the total
             # length is less than the specified length.
             # Account for [CLS], [SEP], [SEP] with "- 3"
@@ -216,7 +212,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         else:
             # Account for [CLS] and [SEP] with "- 2"
             if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[0:(max_seq_length - 2)]
+                tokens_a = tokens_a[:(max_seq_length - 2)]
         # The convention in BERT is:
         # (a) For sequence pairs:
@@ -236,22 +232,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         # For classification tasks, the first vector (corresponding to [CLS]) is
         # used as the "sentence vector". Note that this only makes sense because
         # the entire model is fine-tuned.
-        tokens = []
-        segment_ids = []
-        tokens.append("[CLS]")
-        segment_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            segment_ids.append(0)
-        tokens.append("[SEP]")
-        segment_ids.append(0)
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
         if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)
         input_ids = tokenizer.convert_tokens_to_ids(tokens)
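The list-arithmetic version builds exactly the same `[CLS] tokens_a [SEP] tokens_b [SEP]` layout and 0/1 segment ids as the removed append loops; a small self-contained sketch with made-up word pieces (not real tokenizer output):

```python
tokens_a = ["hello", "world"]
tokens_b = ["good", "bye"]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)          # segment 0 covers [CLS], tokens_a and the first [SEP]
if tokens_b:
    tokens += tokens_b + ["[SEP]"]
    segment_ids += [1] * (len(tokens_b) + 1)  # segment 1 covers tokens_b and its [SEP]

assert tokens == ["[CLS]", "hello", "world", "[SEP]", "good", "bye", "[SEP]"]
assert segment_ids == [0, 0, 0, 0, 1, 1, 1]
```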
@@ -260,10 +246,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         input_mask = [1] * len(input_ids)
         # Zero-pad up to the sequence length.
-        while len(input_ids) < max_seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            segment_ids.append(0)
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
         assert len(input_ids) == max_seq_length
         assert len(input_mask) == max_seq_length
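The vectorized padding is equivalent to the old `while` loop whenever `len(input_ids) <= max_seq_length`, which the truncation earlier in the function guarantees. A toy check (the ids are arbitrary placeholders):

```python
max_seq_length = 8
input_ids = [101, 7592, 2088, 102]   # placeholder token ids
input_mask = [1] * len(input_ids)
segment_ids = [0] * len(input_ids)

# Pad all three lists with zeros in one shot instead of looping.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding

assert input_ids == [101, 7592, 2088, 102, 0, 0, 0, 0]
assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
```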
...@@ -409,14 +395,14 @@ def main(): ...@@ -409,14 +395,14 @@ def main():
type=int, type=int,
default=-1, default=-1,
help="local_rank for distributed training on gpus") help="local_rank for distributed training on gpus")
parser.add_argument('--seed', parser.add_argument('--seed',
type=int, type=int,
default=42, default=42,
help="random seed for initialization") help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps', parser.add_argument('--gradient_accumulation_steps',
type=int, type=int,
default=1, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.") help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--optimize_on_cpu', parser.add_argument('--optimize_on_cpu',
default=False, default=False,
action='store_true', action='store_true',
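For context on the `--gradient_accumulation_steps` flag defined above: gradients are accumulated over several micro-batches and the optimizer only steps every N of them, so the effective batch size is N times the per-step batch size. The loop below is a generic, self-contained sketch of that pattern with toy stand-ins, not the script's actual training loop:

```python
import torch
from torch import nn

# Toy stand-ins; the real script uses BertForSequenceClassification and BertAdam.
model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
train_dataloader = [(torch.randn(8, 4), torch.randint(0, 2, (8,))) for _ in range(8)]

gradient_accumulation_steps = 4  # mirrors the --gradient_accumulation_steps flag
loss_fn = nn.CrossEntropyLoss()

for step, (inputs, labels) in enumerate(train_dataloader):
    loss = loss_fn(model(inputs), labels)
    loss = loss / gradient_accumulation_steps  # keep gradient magnitudes comparable
    loss.backward()                            # gradients accumulate across micro-batches
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()                       # one update per 4 micro-batches (effective batch 32)
        optimizer.zero_grad()
```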
@@ -437,6 +423,12 @@ def main():
         "mrpc": MrpcProcessor,
     }
+    num_labels_task = {
+        "cola": 2,
+        "mnli": 3,
+        "mrpc": 2,
+    }
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
...@@ -475,6 +467,7 @@ def main(): ...@@ -475,6 +467,7 @@ def main():
raise ValueError("Task not found: %s" % (task_name)) raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]() processor = processors[task_name]()
num_labels = num_labels_task[task_name]
label_list = processor.get_labels() label_list = processor.get_labels()
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
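Taken together with the `num_labels_task` dict added in the previous hunk, this lookup keeps the number of output classes in sync with the chosen task. A minimal sketch of the flow (the dict contents are copied from the hunk above; the error branch mirrors the existing "Task not found" check and is an assumption here):

```python
num_labels_task = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
}

task_name = "mnli"
if task_name not in num_labels_task:
    # Hypothetical guard, in the spirit of the script's existing task check.
    raise ValueError("Task not found: %s" % (task_name))
num_labels = num_labels_task[task_name]
assert num_labels == 3
```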
@@ -487,8 +480,9 @@ def main():
             len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
     # Prepare model
-    cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank) # for distributed learning
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir)
+    model = BertForSequenceClassification.from_pretrained(args.bert_model,
+              cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
+              num_labels = num_labels)
     if args.fp16:
         model.half()
     model.to(device)
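Passing `num_labels` through `from_pretrained` sizes the final classification layer, so the logits come out with shape `(batch_size, num_labels)`. A quick sketch, assuming `pytorch_pretrained_bert` is installed and the `bert-base-uncased` weights are cached or downloadable:

```python
import torch
from pytorch_pretrained_bert.modeling import BertForSequenceClassification

# num_labels controls the width of the classifier head added on top of BERT.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.eval()

input_ids = torch.zeros(2, 16, dtype=torch.long)  # dummy batch: 2 sequences of length 16
with torch.no_grad():
    logits = model(input_ids)                     # no labels passed -> returns logits
assert logits.shape == (2, 3)
```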
...
@@ -17,6 +17,7 @@
 import math
 import torch
 from torch.optim import Optimizer
+from torch.optim.optimizer import required
 from torch.nn.utils import clip_grad_norm_
 def warmup_cosine(x, warmup=0.002):
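`required` is the sentinel object torch's own optimizers use for hyper-parameters that have no usable default: if a subclass leaves it in its `defaults` dict, the base `Optimizer` raises when the argument was not supplied. A small illustration of the pattern (`ToyOptimizer` is a hypothetical class, not part of this repo):

```python
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required


class ToyOptimizer(Optimizer):
    """Hypothetical optimizer illustrating the `required` default pattern."""

    def __init__(self, params, lr=required):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        super(ToyOptimizer, self).__init__(params, defaults={"lr": lr})


params = [torch.nn.Parameter(torch.zeros(1))]
ToyOptimizer(params, lr=0.1)   # ok: lr supplied explicitly
try:
    ToyOptimizer(params)       # lr left as the `required` sentinel
except ValueError as err:
    print(err)                 # the base Optimizer rejects the missing lr
```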
@@ -55,10 +56,10 @@ class BertAdam(Optimizer):
         weight_decay_rate: Weight decay. Default: 0.01
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
     """
-    def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
+    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
                  b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
                  max_grad_norm=1.0):
-        if not lr >= 0.0:
+        if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
...