Unverified Commit 146c5212 authored by Lysandre Debut, committed by GitHub

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
@@ -33,6 +33,9 @@ from tqdm import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
AlbertConfig,
AlbertForTokenClassification,
AlbertTokenizer,
BertConfig,
BertForTokenClassification,
BertTokenizer,
@@ -70,6 +73,7 @@ ALL_MODELS = sum(
)
MODEL_CLASSES = {
"albert": (AlbertConfig, AlbertForTokenClassification, AlbertTokenizer),
"bert": (BertConfig, BertForTokenClassification, BertTokenizer), "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
"roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer), "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
...@@ -77,6 +81,8 @@ MODEL_CLASSES = { ...@@ -77,6 +81,8 @@ MODEL_CLASSES = {
"xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer), "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
} }
TOKENIZER_ARGS = ["do_lower_case", "strip_accents", "keep_accents", "use_fast"]
def set_seed(args):
random.seed(args.seed)
@@ -462,7 +468,13 @@ def main():
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)
parser.add_argument(
"--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents."
)
parser.add_argument(
"--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents."
)
parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument( parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
...@@ -545,7 +557,7 @@ def main(): ...@@ -545,7 +557,7 @@ def main():
# Setup CUDA, GPU & distributed training # Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda: if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
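For context, `torch.cuda.device_count()` does not consult the `--no_cuda` flag, so the explicit guard above is what keeps multi-GPU code paths disabled on CPU-only runs; a minimal sketch:

```python
import torch

no_cuda = True  # as if --no_cuda had been passed
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
n_gpu = 0 if no_cuda else torch.cuda.device_count()  # without the guard this could still be > 0
print(device, n_gpu)  # -> cpu 0, even on a machine with GPUs
```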
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
@@ -590,10 +602,12 @@ def main():
label2id={label: i for i, label in enumerate(labels)},
cache_dir=args.cache_dir if args.cache_dir else None,
)
tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
logger.info("Tokenizer arguments: %s", tokenizer_args)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
**tokenizer_args,
)
model = model_class.from_pretrained(
args.model_name_or_path,
@@ -636,7 +650,7 @@ def main():
# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args)
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(
@@ -658,7 +672,7 @@ def main():
writer.write("{} = {}\n".format(key, str(results[key])))
if args.do_predict and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args)
model = model_class.from_pretrained(args.output_dir)
model.to(args.device)
result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
...
# Install the newest pytorch-lightning.
pip install -U git+http://github.com/PyTorchLightning/pytorch-lightning/
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased
python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
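The preprocessing step starts a new example whenever a sentence's subword-tokenized length would exceed `MAX_LENGTH`. A rough, hypothetical sketch of that idea (not the downloaded `preprocess.py` itself):

```python
import sys
from transformers import AutoTokenizer

dataset, model_name, max_len = sys.argv[1], sys.argv[2], int(sys.argv[3])
max_len -= 2  # leave room for the special tokens added at encoding time
tokenizer = AutoTokenizer.from_pretrained(model_name)

subword_len = 0
with open(dataset, encoding="utf-8") as f:
    for line in f:
        line = line.rstrip()
        if not line:  # blank line = sentence boundary
            print(line)
            subword_len = 0
            continue
        token = line.split()[0]
        length = len(tokenizer.tokenize(token))
        if subword_len + length > max_len:
            print("")  # split here so the next example stays within the limit
            subword_len = 0
        print(line)
        subword_len += length
```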
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
export OUTPUT_DIR=germeval-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
...
@@ -7,8 +7,7 @@ import numpy as np
import torch
from seqeval.metrics import f1_score, precision_score, recall_score
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformer_base import BaseTransformer, add_generic_args, generic_train
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
@@ -25,13 +24,14 @@ class NERTransformer(BaseTransformer):
def __init__(self, hparams):
self.labels = get_labels(hparams.labels)
num_labels = len(self.labels)
self.pad_token_label_id = CrossEntropyLoss().ignore_index
super(NERTransformer, self).__init__(hparams, num_labels)
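The padding label comes straight from the loss function, so positions marked with it (padding and non-first sub-tokens) are skipped when the loss is computed; concretely:

```python
from torch.nn import CrossEntropyLoss

pad_token_label_id = CrossEntropyLoss().ignore_index
print(pad_token_label_id)  # -100: targets set to this value do not contribute to the loss
```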
def forward(self, **inputs):
return self.model(**inputs)
def training_step(self, batch, batch_num):
"Compute loss and log."
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if self.hparams.model_type != "distilbert":
inputs["token_type_ids"] = (
@@ -40,25 +40,61 @@ class NERTransformer(BaseTransformer):
outputs = self.forward(**inputs)
loss = outputs[0]
tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
return {"loss": loss, "log": tensorboard_logs}
def _feature_file(self, mode):
return os.path.join(
self.hparams.data_dir,
"cached_{}_{}_{}".format(
mode,
list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
str(self.hparams.max_seq_length),
),
)
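For illustration, with hypothetical hparams the helper resolves to a path like this:

```python
import os

data_dir, model_name_or_path, max_seq_length, mode = "./germeval", "bert-base-multilingual-cased", 128, "train"
cached = os.path.join(
    data_dir,
    "cached_{}_{}_{}".format(mode, list(filter(None, model_name_or_path.split("/"))).pop(), str(max_seq_length)),
)
print(cached)  # ./germeval/cached_train_bert-base-multilingual-cased_128
```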
def prepare_data(self):
"Called to initialize data. Use the call to construct features"
args = self.hparams
for mode in ["train", "dev", "test"]:
cached_features_file = self._feature_file(mode)
if not os.path.exists(cached_features_file):
logger.info("Creating features from dataset file at %s", args.data_dir)
examples = read_examples_from_file(args.data_dir, mode)
features = convert_examples_to_features(
examples,
self.labels,
args.max_seq_length,
self.tokenizer,
cls_token_at_end=bool(args.model_type in ["xlnet"]),
cls_token=self.tokenizer.cls_token,
cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ["roberta"]),
pad_on_left=bool(args.model_type in ["xlnet"]),
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
pad_token_label_id=self.pad_token_label_id,
)
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
def load_dataset(self, mode, batch_size):
"Load datasets. Called after prepare data."
cached_features_file = self._feature_file(mode)
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
return DataLoader(
TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids), batch_size=batch_size
)
def validation_step(self, batch, batch_nb):
"Compute validation"
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if self.hparams.model_type != "distilbert":
inputs["token_type_ids"] = (
@@ -68,11 +104,10 @@ class NERTransformer(BaseTransformer):
tmp_eval_loss, logits = outputs[:2]
preds = logits.detach().cpu().numpy()
out_label_ids = inputs["labels"].detach().cpu().numpy()
return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
return {"val_loss": tmp_eval_loss, "pred": preds, "target": out_label_ids}
def _eval_end(self, outputs):
"Evaluation called for both Val and Test"
val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
preds = np.concatenate([x["pred"] for x in outputs], axis=0)
preds = np.argmax(preds, axis=2)
@@ -96,7 +131,6 @@ class NERTransformer(BaseTransformer):
}
if self.is_logger():
logger.info(self.proc_rank)
logger.info("***** Eval results *****") logger.info("***** Eval results *****")
for key in sorted(results.keys()): for key in sorted(results.keys()):
logger.info(" %s = %s", key, str(results[key])) logger.info(" %s = %s", key, str(results[key]))
...@@ -140,56 +174,6 @@ class NERTransformer(BaseTransformer): ...@@ -140,56 +174,6 @@ class NERTransformer(BaseTransformer):
) )
return ret return ret
def load_and_cache_examples(self, labels, pad_token_label_id, mode):
args = self.hparams
tokenizer = self.tokenizer
if self.proc_rank not in [-1, 0] and mode == "train":
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Load data features from cache or dataset file
cached_features_file = os.path.join(
args.data_dir,
"cached_{}_{}_{}".format(
mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
),
)
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
examples = read_examples_from_file(args.data_dir, mode)
features = convert_examples_to_features(
examples,
labels,
args.max_seq_length,
tokenizer,
cls_token_at_end=bool(args.model_type in ["xlnet"]),
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ["roberta"]),
pad_on_left=bool(args.model_type in ["xlnet"]),
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
pad_token_label_id=pad_token_label_id,
)
if self.proc_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
if self.proc_rank == 0 and mode == "train":
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
return dataset
@staticmethod
def add_model_specific_args(parser, root_dir):
# Add NER specific options
...
import logging
import os
import random
@@ -26,6 +27,9 @@ from transformers import (
)
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(
tuple(conf.pretrained_config_archive_map.keys())
@@ -77,20 +81,14 @@ class BaseTransformer(pl.LightningModule):
cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
)
self.config, self.tokenizer, self.model = config, tokenizer, model
self.proc_rank = -1
def is_logger(self):
return self.trainer.proc_rank <= 0
def configure_optimizers(self):
"Prepare optimizer and schedule (linear warmup and decay)"
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
@@ -103,18 +101,16 @@ class BaseTransformer(pl.LightningModule):
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
self.opt = optimizer
return [optimizer]
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
if self.trainer.use_tpu:
xm.optimizer_step(optimizer)
else:
optimizer.step()
optimizer.zero_grad()
self.lr_scheduler.step()
def get_tqdm_dict(self):
tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
@@ -127,22 +123,27 @@ class BaseTransformer(pl.LightningModule):
def test_end(self, outputs):
return self.validation_end(outputs)
@pl.data_loader
def train_dataloader(self):
train_batch_size = self.hparams.train_batch_size
dataloader = self.load_dataset("train", train_batch_size)
t_total = (
(len(dataloader.dataset) // (train_batch_size * max(1, self.hparams.n_gpu)))
// self.hparams.gradient_accumulation_steps
* float(self.hparams.num_train_epochs)
)
scheduler = get_linear_schedule_with_warmup(
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
)
self.lr_scheduler = scheduler
return dataloader
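The schedule is built here rather than in `configure_optimizers` because `t_total` depends on the dataset length, which is only known once the train dataloader exists. A quick worked example with hypothetical numbers:

```python
# Hypothetical run: 10,000 training examples, batch size 32, 1 GPU,
# gradient_accumulation_steps=1, num_train_epochs=3.
dataset_size, train_batch_size, n_gpu = 10_000, 32, 1
gradient_accumulation_steps, num_train_epochs = 1, 3.0
t_total = (dataset_size // (train_batch_size * max(1, n_gpu))) // gradient_accumulation_steps * num_train_epochs
print(t_total)  # 936.0 scheduler steps for the linear warmup/decay
```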
@pl.data_loader
def val_dataloader(self):
return self.load_dataset("dev", self.hparams.eval_batch_size)
@pl.data_loader
def test_dataloader(self):
return self.load_dataset("test", self.hparams.eval_batch_size)
def init_ddp_connection(self, proc_rank, world_size):
self.proc_rank = proc_rank
super(BaseTransformer, self).init_ddp_connection(proc_rank, world_size)
@staticmethod
def add_model_specific_args(parser, root_dir):
parser.add_argument(
@@ -213,6 +214,7 @@ def add_generic_args(parser, root_dir):
)
parser.add_argument("--n_gpu", type=int, default=1)
parser.add_argument("--n_tpu_cores", type=int, default=0)
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
...@@ -252,13 +254,22 @@ def generic_train(model, args): ...@@ -252,13 +254,22 @@ def generic_train(model, args):
accumulate_grad_batches=args.gradient_accumulation_steps, accumulate_grad_batches=args.gradient_accumulation_steps,
gpus=args.n_gpu, gpus=args.n_gpu,
max_epochs=args.num_train_epochs, max_epochs=args.num_train_epochs,
early_stop_callback=False,
gradient_clip_val=args.max_grad_norm,
checkpoint_callback=checkpoint_callback,
)
if args.fp16:
train_params["use_amp"] = args.fp16
train_params["amp_level"] = args.fp16_opt_level
if args.n_tpu_cores > 0:
global xm
import torch_xla.core.xla_model as xm
train_params["num_tpu_cores"] = args.n_tpu_cores
train_params["gpus"] = 0
if args.n_gpu > 1:
train_params["distributed_backend"] = "ddp"
...
@@ -338,7 +338,7 @@ def main():
# Setup devices and distributed training
if args.local_rank == -1 or args.no_cuda:
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
args.device = torch.device("cuda", args.local_rank)
...
@@ -189,7 +189,7 @@ def main():
args = parser.parse_args()
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
set_seed(args)
...
@@ -183,8 +183,11 @@ def train(args, train_dataset, model, tokenizer):
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
# set global_step to global_step of last saved checkpoint from model path
try:
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
except ValueError:
global_step = 0
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
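Illustrative inputs for the checkpoint parsing above (hypothetical paths):

```python
def parse_global_step(model_name_or_path):
    # Mirrors the logic above: take the suffix after the last "-" and before any trailing "/".
    try:
        return int(model_name_or_path.split("-")[-1].split("/")[0])
    except ValueError:
        return 0  # e.g. a plain model id such as "bert-base-cased" has no step suffix

print(parse_global_step("output/checkpoint-500"))  # 500
print(parse_global_step("bert-base-cased"))        # 0
```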
@@ -575,7 +578,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
...
@@ -663,7 +663,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
...
@@ -535,7 +535,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
...
@@ -725,7 +725,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
...
@@ -530,7 +530,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
...
### Get the CNN/Daily Mail Data
To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
```bash
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
```
This should create a directory called `cnn_dm/` with files like `test.source`.
To use your own data, copy that file format: each article to be summarized is on its own line.
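A quick sanity check of that layout (hypothetical paths), since each line must hold one complete article:

```python
with open("cnn_dm/test.source", encoding="utf-8") as f:
    articles = [line.rstrip("\n") for line in f]
print(len(articles), "articles; the first one starts:", articles[0][:80])
```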
### Usage
To create summaries for each article in the dataset, run:
```bash
python evaluate_cnn.py <path_to_test.source> cnn_test_summaries.txt
```
The default batch size of 8 fits in 16 GB of GPU memory, but it may need to be adjusted for your system.
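If you prefer to drive the generation from Python rather than the CLI, a minimal sketch (assuming you run it from the directory that contains `evaluate_cnn.py`):

```python
from evaluate_cnn import generate_summaries

# One source article per line, matching the cnn_dm/test.source format described above.
lns = [" " + x.rstrip() for x in open("cnn_dm/test.source").readlines()]
generate_summaries(lns, "cnn_test_summaries.txt", batch_size=4, device="cuda")
```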
### Where is the code?
The core model is in `src/transformers/modeling_bart.py`. This directory only contains examples.
### (WIP) Rouge Scores
### Stanford CoreNLP Setup
```
ptb_tokenize () {
cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > $2
}
sudo apt install openjdk-8-jre-headless
sudo apt-get install ant
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
unzip stanford-corenlp-full-2018-10-05.zip
cd stanford-corenlp-full-2018-10-05
export CLASSPATH=stanford-corenlp-3.9.2.jar:stanford-corenlp-3.9.2-models.jar
```
### Rouge Setup
Install `files2rouge` following the instructions [here](https://github.com/pltrdy/files2rouge).
I also needed to run `sudo apt-get install libxml-parser-perl`.
```python
from files2rouge import files2rouge
from files2rouge import settings
files2rouge.run(<path_to_tokenized_hypo>,
<path_to_tokenized_target>,
saveto='rouge_output.txt')
```
import argparse
from pathlib import Path
import torch
from tqdm import tqdm
from transformers import BartForMaskedLM, BartTokenizer
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
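# Example (illustrative): list(chunks(["a", "b", "c"], 2)) -> [["a", "b"], ["c"]]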
def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE):
fout = Path(out_file).open("w")
model = BartForMaskedLM.from_pretrained("bart-large-cnn", output_past=True).to(device)
tokenizer = BartTokenizer.from_pretrained("bart-large")
for batch in tqdm(list(chunks(lns, batch_size))):
dct = tokenizer.batch_encode_plus(batch, max_length=1024, return_tensors="pt", pad_to_max_length=True)
summaries = model.generate(
input_ids=dct["input_ids"].to(device),
attention_mask=dct["attention_mask"].to(device),
num_beams=4,
length_penalty=2.0,
max_length=140,
min_len=55,
no_repeat_ngram_size=3,
)
dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
for hypothesis in dec:
fout.write(hypothesis + "\n")
fout.flush()
def _run_generate():
parser = argparse.ArgumentParser()
parser.add_argument(
"source_path", type=str, help="like cnn_dm/test.source",
)
parser.add_argument(
"output_path", type=str, help="where to save summaries",
)
parser.add_argument(
"--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.",
)
parser.add_argument(
"--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time",
)
args = parser.parse_args()
lns = [" " + x.rstrip() for x in open(args.source_path).readlines()]
generate_summaries(lns, args.output_path, batch_size=args.bs, device=args.device)
if __name__ == "__main__":
_run_generate()
import logging
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from .evaluate_cnn import _run_generate
articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
class TestBartExamples(unittest.TestCase):
def test_bart_cnn_cli(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
tmp = Path(tempfile.gettempdir()) / "utest_generations.hypo"
with tmp.open("w") as f:
f.write("\n".join(articles))
testargs = ["evaluate_cnn.py", str(tmp), "output.txt"]
with patch.object(sys, "argv", testargs):
_run_generate()
self.assertTrue(Path("output.txt").exists())
@@ -15,7 +15,7 @@ pip install nltk py-rouge
cd examples/summarization
```
## Reproduce the authors' ROUGE score
To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
...