Commit 40ed7172 authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                           XLNetTokenizer, RobertaConfig,
                           RobertaForMultipleChoice, RobertaTokenizer)
 
-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup
 
 from utils_multiple_choice import (convert_examples_to_features, processors)
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
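For context, `get_linear_schedule_with_warmup` is the function-based replacement for the removed `WarmupLinearSchedule` class: it builds a scheduler that increases the learning rate linearly over `num_warmup_steps` and then decays it linearly to zero by `num_training_steps`. A minimal sketch of the new call pattern, with a placeholder model and placeholder step counts:

```python
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)  # placeholder model
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# warm up for the first 100 updates, then decay linearly until update 1000
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=1000)

for _ in range(1000):
    optimizer.step()   # parameter update (loss/backward omitted for brevity)
    scheduler.step()   # advance the schedule once per update
```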
@@ -226,9 +226,13 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_sampler = SequentialSampler(eval_dataset)
     eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Eval!
     logger.info("***** Running evaluation {} *****".format(prefix))
     logger.info("  Num examples = %d", len(eval_dataset))
@@ -464,9 +468,17 @@ def main():
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+                                          num_labels=num_labels,
+                                          finetuning_task=args.task_name,
+                                          cache_dir=args.cache_dir if args.cache_dir else None)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+                                                do_lower_case=args.do_lower_case,
+                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    model = model_class.from_pretrained(args.model_name_or_path,
+                                        from_tf=bool('.ckpt' in args.model_name_or_path),
+                                        config=config,
+                                        cache_dir=args.cache_dir if args.cache_dir else None)
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
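A note on the `cache_dir` argument threaded through these `from_pretrained` calls: it only redirects where the downloaded pretrained files are cached, and when it is left as `None` the library keeps using its default cache directory. A minimal sketch with a placeholder path:

```python
from transformers import BertConfig, BertTokenizer

# downloaded files are stored under ./my_cache instead of the default cache directory
config = BertConfig.from_pretrained("bert-base-uncased", cache_dir="./my_cache")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", cache_dir="./my_cache")
```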
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert). """
+""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
 
 from __future__ import absolute_import, division, print_function
@@ -33,17 +33,23 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
 from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
 
-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup
 from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
+from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
+from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
+from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
 
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, )),
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
     ())
 
 MODEL_CLASSES = {
     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
 }
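For context, registering the new entries in `MODEL_CLASSES` is what lets `--model_type roberta` (or `distilbert`, `camembert`) resolve to the matching config/model/tokenizer classes. A minimal sketch of the equivalent lookup for RoBERTa, assuming the standard `roberta-base` checkpoint and the nine CoNLL-2003 BIO tags:

```python
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer

# equivalent to: config_class, model_class, tokenizer_class = MODEL_CLASSES["roberta"]
config_class, model_class, tokenizer_class = RobertaConfig, RobertaForTokenClassification, RobertaTokenizer

config = config_class.from_pretrained("roberta-base", num_labels=9)  # 9 CoNLL-2003 BIO labels
tokenizer = tokenizer_class.from_pretrained("roberta-base")
model = model_class.from_pretrained("roberta-base", config=config)
```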
@@ -78,7 +84,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
         ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -119,9 +125,10 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
         batch = tuple(t.to(args.device) for t in batch)
         inputs = {"input_ids": batch[0],
                   "attention_mask": batch[1],
-                  "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
-                  # XLM and RoBERTa don't use segment_ids
                   "labels": batch[3]}
+        if args.model_type != "distilbert":
+            inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don't use segment_ids
         outputs = model(**inputs)
         loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
@@ -133,13 +140,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
         if args.fp16:
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
-            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
         else:
             loss.backward()
-            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
         tr_loss += loss.item()
         if (step + 1) % args.gradient_accumulation_steps == 0:
+            if args.fp16:
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
             scheduler.step()  # Update learning rate schedule
             optimizer.step()
             model.zero_grad()
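The change above moves gradient clipping from right after each backward pass to just before the optimizer update, so that when `gradient_accumulation_steps > 1` the norm is computed once on the fully accumulated gradients rather than on each partial micro-batch gradient. A minimal standalone sketch of that pattern, with placeholder model, data and hyperparameters:

```python
import torch

model = torch.nn.Linear(10, 2)                      # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps, max_grad_norm = 4, 1.0

for step in range(32):
    x, y = torch.randn(8, 10), torch.randn(8, 2)    # placeholder micro-batch
    loss = torch.nn.functional.mse_loss(model(x), y) / accumulation_steps
    loss.backward()                                 # gradients accumulate across micro-batches
    if (step + 1) % accumulation_steps == 0:
        # clip once, on the accumulated gradients, right before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
```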
@@ -148,7 +158,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
             if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                 # Log metrics
                 if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                    results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id)
+                    results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
                 for key, value in results.items():
                     tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                 tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -186,6 +196,10 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
     eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
     eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Eval!
     logger.info("***** Running evaluation %s *****", prefix)
     logger.info("  Num examples = %d", len(eval_dataset))
@@ -201,12 +215,15 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
         with torch.no_grad():
             inputs = {"input_ids": batch[0],
                       "attention_mask": batch[1],
-                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
-                      # XLM and RoBERTa don't use segment_ids
                       "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don't use segment_ids
             outputs = model(**inputs)
             tmp_eval_loss, logits = outputs[:2]
 
+            if args.n_gpu > 1:
+                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
+
             eval_loss += tmp_eval_loss.item()
         nb_eval_steps += 1
         if preds is None:
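The two evaluation additions above belong together: wrapping the model in `torch.nn.DataParallel` scatters each evaluation batch across the visible GPUs, and because the gathered output then holds one loss value per replica, `tmp_eval_loss.mean()` collapses it back to a scalar before it is accumulated. A minimal sketch of the wrapping step, with a placeholder model:

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.nn.Linear(10, 1).to(device)   # placeholder model
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)    # each replica processes a slice of the batch

with torch.no_grad():
    outputs = model(torch.randn(16, 10).to(device))  # DataParallel gathers per-replica outputs
    loss = outputs.mean()                            # reduce gathered values to one scalar, as in the diff
```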
@@ -420,11 +437,15 @@ def main():
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels)
+                                          num_labels=num_labels,
+                                          cache_dir=args.cache_dir if args.cache_dir else None)
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path),
-                                        config=config)
+                                                do_lower_case=args.do_lower_case,
+                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    model = model_class.from_pretrained(args.model_name_or_path,
+                                        from_tf=bool(".ckpt" in args.model_name_or_path),
+                                        config=config,
+                                        cache_dir=args.cache_dir if args.cache_dir else None)
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -508,3 +529,4 @@ def main():
 
 if __name__ == "__main__":
     main()
 import os
 import tensorflow as tf
 import tensorflow_datasets
-from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
+from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
 
 # script parameters
 BATCH_SIZE = 32
 EVAL_BATCH_SIZE = BATCH_SIZE * 2
 USE_XLA = False
 USE_AMP = False
+EPOCHS = 3
+
+TASK = "mrpc"
+if TASK == "sst-2":
+    TFDS_TASK = "sst2"
+elif TASK == "sts-b":
+    TFDS_TASK = "stsb"
+else:
+    TFDS_TASK = TASK
+
+num_labels = len(glue_processors[TASK]().get_labels())
+print(num_labels)
 
 tf.config.optimizer.set_jit(USE_XLA)
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
 
-# Load tokenizer and model from pretrained model/vocabulary
+# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
+config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
 
 # Load dataset via TensorFlow Datasets
-data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
+data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
 train_examples = info.splits['train'].num_examples
+
+# MNLI expects either validation_matched or validation_mismatched
 valid_examples = info.splits['validation'].num_examples
 
 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
+train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
+
+# MNLI expects either validation_matched or validation_mismatched
+valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
 train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
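For context, `glue_processors[TASK]().get_labels()` is how the script decides between classification and regression further down: MRPC and SST-2 report two labels, while STS-B reports a single label and is therefore trained with a mean-squared-error loss instead of sparse categorical cross-entropy. A small check, assuming the `glue_processors` mapping imported above:

```python
from transformers import glue_processors

for task in ("mrpc", "sst-2", "sts-b"):
    n = len(glue_processors[task]().get_labels())
    print(task, n, "label(s) ->", "regression" if n == 1 else "classification")
```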
@@ -32,7 +50,13 @@ opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 if USE_AMP:
     # loss scaling is currently required when using mixed precision
     opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
 
-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+if num_labels == 1:
+    loss = tf.keras.losses.MeanSquaredError()
+else:
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=opt, loss=loss, metrics=[metric])
@@ -40,24 +64,30 @@ model.compile(optimizer=opt, loss=loss, metrics=[metric])
 train_steps = train_examples//BATCH_SIZE
 valid_steps = valid_examples//EVAL_BATCH_SIZE
 
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
+history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
                     validation_data=valid_dataset, validation_steps=valid_steps)
 
 # Save TF2 model
 os.makedirs('./save/', exist_ok=True)
 model.save_pretrained('./save/')
 
-# Load the TensorFlow model in PyTorch for inspection
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
-
-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = 'This research was consistent with his findings.'
-sentence_1 = 'His findings were compatible with this research.'
-sentence_2 = 'His findings were not compatible with this research.'
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
-pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
-print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
+if TASK == "mrpc":
+    # Load the TensorFlow model in PyTorch for inspection
+    # This is to demo the interoperability between the two frameworks, you don't have to
+    # do this in real life (you can run the inference on the TF model).
+    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+
+    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+    sentence_0 = 'This research was consistent with his findings.'
+    sentence_1 = 'His findings were compatible with this research.'
+    sentence_2 = 'His findings were not compatible with this research.'
+    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+
+    del inputs_1["special_tokens_mask"]
+    del inputs_2["special_tokens_mask"]
+
+    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+    print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
+    print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
# Text Summarization with Pretrained Encoders
This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
The original code can be found in Yang Liu's [GitHub repository](https://github.com/nlpyang/PreSumm).
The model is loaded with pre-trained weights for abstractive summarization on the CNN/Daily Mail dataset; the model was trained on the extractive task first and then fine-tuned on the abstractive task.
## Setup
```
git clone https://github.com/huggingface/transformers && cd transformers
pip install [--editable] .
pip install nltk py-rouge
cd examples/summarization
```
## Reproduce the authors' results on ROUGE
To reproduce the authors' results on the CNN/Daily Mail dataset, first download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running:
```bash
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
```
Then move all the stories into a single folder; we will refer to its path as `$DATA_PATH`. Run the following in the same folder as `run_summarization.py`:
```bash
python run_summarization.py \
--documents_dir $DATA_PATH \
--summaries_output_dir $SUMMARIES_PATH \ # optional
--to_cpu false \
--batch_size 4 \
--min_length 50 \
--max_length 200 \
--beam_size 5 \
--alpha 0.95 \
--block_trigram true \
--compute_rouge true
```
The script runs on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores are displayed in the console at the end of evaluation and written to a `rouge_scores.txt` file. With a single Tesla V100 GPU and a batch size of 10, the evaluation takes about 30 hours (roughly 300,000 texts to summarize).
## Summarize any text
Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
```bash
python run_summarization.py \
--documents_dir $DATA_PATH \
--summaries_output_dir $SUMMARIES_PATH \ # optional
--to_cpu false \
--batch_size 4 \
--min_length 50 \
--max_length 200 \
--beam_size 5 \
--alpha 0.95 \
--block_trigram true \
```
You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset, you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries; a rough sketch of what such a loader could look like follows.
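As an illustration only (the function name and directory layout below are hypothetical and not part of `utils_summarization.py`), the loader essentially just has to yield (document, reference summary) pairs so that the generated summaries can be scored against the references:

```python
import os

def iter_document_summary_pairs(documents_dir, summaries_dir):
    """Hypothetical helper: pair each document with its reference summary by filename."""
    for name in sorted(os.listdir(documents_dir)):
        doc_path = os.path.join(documents_dir, name)
        ref_path = os.path.join(summaries_dir, name)
        if not os.path.isfile(ref_path):
            continue  # skip documents that have no reference summary
        with open(doc_path, encoding="utf-8") as doc, open(ref_path, encoding="utf-8") as ref:
            yield doc.read(), ref.read()
```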
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
# For ROUGE
nltk
py-rouge
@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
         logger.addHandler(stream_handler)
 
         testargs = ["run_squad.py",
-                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
-                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--data_dir=./examples/tests_samples/SQUAD",
                     "--model_name=bert-base-uncased",
                     "--output_dir=./examples/tests_samples/temp_dir",
                     "--max_steps=10",
@@ -36,9 +36,15 @@ To create the package for pypi.
 from io import open
 from setuptools import find_packages, setup
 
+extras = {
+    'serving': ['uvicorn', 'fastapi']
+}
+extras['all'] = [package for package in extras.values()]
+
 setup(
     name="transformers",
-    version="2.1.1",
+    version="2.2.1",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
@@ -61,8 +67,11 @@ setup(
         "transformers=transformers.__main__:main",
     ]
     },
+    extras_require=extras,
+    scripts=[
+        'transformers-cli'
+    ],
     # python_requires='>=3.5.0',
-    tests_require=['pytest'],
     classifiers=[
         'Intended Audience :: Science/Research',
         'License :: OSI Approved :: Apache Software License',
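With the `serving` extra declared above, the optional serving dependencies can be installed alongside the package in the usual setuptools way, e.g. `pip install transformers[serving]`, which also pulls in `uvicorn` and `fastapi`.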
# How to add a new example script in 🤗Transformers
This folder provides a template for adding a new example script that implements a training or inference task with the models in the 🤗Transformers library.
Currently, only PyTorch examples are provided. They are adaptations of the library's SQuAD examples and implement single-GPU and distributed training with gradient accumulation and mixed precision (using NVIDIA's apex library), covering a reasonable range of use cases.