Commit 76f0d99f authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 40ed7172 7bd11dda
...
@@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
+| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
 ## Installation
...
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.2.1'
+release = u'2.2.2'
 # -- General configuration ---------------------------------------------------
...
@@ -580,10 +580,16 @@ def main():
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+        if args.do_train:
+            logger.info("Loading checkpoints saved during training for evaluation")
+            checkpoints = [args.output_dir]
+            if args.eval_all_checkpoints:
+                checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+        else:
+            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
+            checkpoints = [args.model_name_or_path]
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
...
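The reworked evaluation block above only scans `output_dir` for checkpoints when the script itself did the training; otherwise it evaluates the model given by `--model_name_or_path`. A minimal sketch of the glob-based checkpoint discovery it relies on, assuming `WEIGHTS_NAME` is the usual `pytorch_model.bin`:

```python
import glob
import os

# Assumption: WEIGHTS_NAME matches the filename used when saving model weights.
WEIGHTS_NAME = "pytorch_model.bin"

def find_checkpoints(output_dir):
    """Return every directory under output_dir that contains saved weights."""
    weight_files = sorted(glob.glob(os.path.join(output_dir, "**", WEIGHTS_NAME), recursive=True))
    return [os.path.dirname(f) for f in weight_files]

# With weights saved in out/, out/checkpoint-500/ and out/checkpoint-1000/,
# this returns ["out/checkpoint-1000", "out/checkpoint-500", "out"] --
# note the sort is lexicographic, not numeric.
```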
@@ -29,7 +29,7 @@ And move all the stories to the same folder. We will refer as `$DATA_PATH` the p
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --to_cpu false \
+    --no_cuda false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
@@ -39,7 +39,7 @@ python run_summarization.py \
     --compute_rouge true
 ```
-The script executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
+The script executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
 ## Summarize any text
@@ -49,7 +49,7 @@ Put the documents that you would like to summarize in a folder (the path to whic
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --to_cpu false \
+    --no_cuda false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
...
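Both commands change `--to_cpu` to `--no_cuda`, matching the flag name used across the other example scripts. A minimal sketch of the device selection such a flag typically drives; the `str2bool` helper is an assumption here, since the README passes the literal string `false` rather than using a bare switch:

```python
import argparse

import torch

def str2bool(value):
    # Hypothetical helper: interpret "true"/"false" strings from the CLI.
    return str(value).lower() in ("1", "true", "t", "yes", "y")

parser = argparse.ArgumentParser()
parser.add_argument("--no_cuda", type=str2bool, default=False,
                    help="Run on CPU even when a GPU is available.")
args = parser.parse_args(["--no_cuda", "false"])

device = torch.device("cpu" if args.no_cuda or not torch.cuda.is_available() else "cuda")
print(device)  # cuda when a GPU is visible, otherwise cpu
```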
@@ -44,7 +44,7 @@ extras['all'] = [package for package in extras.values()]
 setup(
     name="transformers",
-    version="2.2.1",
+    version="2.2.2",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
...
-__version__ = "2.2.1"
+__version__ = "2.2.2"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
...
@@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand):
         list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
         # upload
         upload_parser = parser.add_parser('upload')
-        upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
-        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
+        upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
+        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
         upload_parser.set_defaults(func=lambda args: UploadCommand(args))
@@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand):
 class UploadCommand(BaseUserCommand):
+    def walk_dir(self, rel_path):
+        """
+        Recursively list all files in a folder.
+        """
+        entries: List[os.DirEntry] = list(os.scandir(rel_path))
+        files = [
+            (
+                os.path.join(os.getcwd(), f.path),  # filepath
+                f.path  # filename
+            )
+            for f in entries if f.is_file()
+        ]
+        for f in entries:
+            if f.is_dir():
+                files += self.walk_dir(f.path)
+        return files
+
     def run(self):
         token = HfFolder.get_token()
         if token is None:
             print("Not logged in")
             exit(1)
-        filepath = os.path.join(os.getcwd(), self.args.file)
-        filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
-        print(
-            "About to upload file {} to S3 under filename {}".format(
-                ANSI.bold(filepath), ANSI.bold(filename)
-            )
-        )
+        local_path = os.path.abspath(self.args.path)
+        if os.path.isdir(local_path):
+            if self.args.filename is not None:
+                raise ValueError("Cannot specify a filename override when uploading a folder.")
+            rel_path = os.path.basename(local_path)
+            files = self.walk_dir(rel_path)
+        elif os.path.isfile(local_path):
+            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
+            files = [(local_path, filename)]
+        else:
+            raise ValueError("Not a valid file or directory: {}".format(local_path))
+        for filepath, filename in files:
+            print(
+                "About to upload file {} to S3 under filename {}".format(
+                    ANSI.bold(filepath), ANSI.bold(filename)
+                )
+            )
         choice = input("Proceed? [Y/n] ").lower()
         if not(choice == "" or choice == "y" or choice == "yes"):
             print("Abort")
             exit()
-        print(
-            ANSI.bold("Uploading... This might take a while if file is large")
-        )
-        access_url = self._api.presign_and_upload(
-            token=token, filename=filename, filepath=filepath
-        )
-        print("Your file now lives at:")
-        print(access_url)
+        print(
+            ANSI.bold("Uploading... This might take a while if files are large")
+        )
+        for filepath, filename in files:
+            access_url = self._api.presign_and_upload(
+                token=token, filename=filename, filepath=filepath
+            )
+            print("Your file now lives at:")
+            print(access_url)
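With this change the `upload` command accepts a directory as well as a single file: a folder is walked recursively and each file keeps its relative path as its S3 object name, so uploading `my_model/` from its parent directory pushes every file under it with the folder prefix intact. A standalone sketch of the traversal, mirroring `walk_dir` above:

```python
import os
from typing import List, Tuple

def walk_dir(rel_path: str) -> List[Tuple[str, str]]:
    # Pairs each file's absolute path with the relative path that becomes
    # its S3 object name (same logic as UploadCommand.walk_dir above).
    entries = list(os.scandir(rel_path))
    files = [(os.path.join(os.getcwd(), e.path), e.path) for e in entries if e.is_file()]
    for e in entries:
        if e.is_dir():
            files += walk_dir(e.path)
    return files

# For my_model/ containing config.json and vocab.txt, walk_dir("my_model")
# returns [("/abs/cwd/my_model/config.json", "my_model/config.json"),
#          ("/abs/cwd/my_model/vocab.txt", "my_model/vocab.txt")]
```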
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
     if is_tf_available() and is_tf_dataset:
         def gen():
             for ex in features:
                 yield ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)
...
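The `gen()` generator above exists to feed a TensorFlow dataset. A sketch of how such a generator is wrapped with `tf.data.Dataset.from_generator`; the stand-in values and the dtypes/shapes are assumptions consistent with what the generator yields:

```python
import tensorflow as tf

# Stand-in generator shaped like gen() above (dummy values).
def gen():
    yield ({"input_ids": [101, 2023, 102],
            "attention_mask": [1, 1, 1],
            "token_type_ids": [0, 0, 0]},
           1)

dataset = tf.data.Dataset.from_generator(
    gen,
    ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
     tf.int64),
    ({"input_ids": tf.TensorShape([None]),
      "attention_mask": tf.TensorShape([None]),
      "token_type_ids": tf.TensorShape([None])},
     tf.TensorShape([])),
)
print(next(iter(dataset)))
```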
@@ -139,5 +139,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
 if __name__ == '__main__':
     unittest.main()
@@ -67,6 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 if __name__ == '__main__':
     unittest.main()
@@ -232,6 +232,15 @@ class CommonTestCases:
             self.assertNotEqual(len(tokens_2), 0)
             self.assertIsInstance(text_2, (str, unicode))
+        def test_encode_decode_with_spaces(self):
+            tokenizer = self.get_tokenizer()
+            new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+            tokenizer.add_tokens(new_toks)
+            input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+            encoded = tokenizer.encode(input, add_special_tokens=False)
+            decoded = tokenizer.decode(encoded)
+            self.assertEqual(decoded, input)
+
         def test_pretrained_model_lists(self):
             weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
...
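The new `test_encode_decode_with_spaces` pins down the round-trip behaviour that the `decode()` change further below restores: tokens registered via `add_tokens` must survive `encode()`/`decode()` with single spaces around them. A concrete illustration (assumes the `bert-base-uncased` checkpoint can be downloaded):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[ABC]", "[DEF]"])

text = "[ABC] hello [DEF]"
ids = tokenizer.encode(text, add_special_tokens=False)
print(tokenizer.decode(ids))  # expected after this fix: "[ABC] hello [DEF]"
```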
@@ -637,9 +637,11 @@ class PreTrainedTokenizer(object):
             text: The sequence to be encoded.
             **kwargs: passed to the child `self.tokenize()` method
         """
+        all_special_tokens = self.all_special_tokens
+
         def lowercase_text(t):
             # convert non-special tokens to lowercase
-            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
+            escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
             pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
                 r'(.+?)'
             return re.sub(
@@ -680,17 +682,17 @@ class PreTrainedTokenizer(object):
             tokenized_text = []
             for sub_text in text_list:
                 if sub_text not in self.added_tokens_encoder \
-                        and sub_text not in self.all_special_tokens:
+                        and sub_text not in all_special_tokens:
                     tokenized_text += split_on_token(tok, sub_text)
                 else:
                     tokenized_text += [sub_text]
             text_list = tokenized_text
         return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
-            in self.added_tokens_encoder and token not in self.all_special_tokens \
+            in self.added_tokens_encoder and token not in all_special_tokens \
             else [token] for token in tokenized_text)))
-        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
+        added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
         return tokenized_text
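Hoisting `self.all_special_tokens` into a local variable is a small performance fix: the property rebuilds its list on every access, and `tokenize()` consults it once per sub-string and per added token. A toy sketch of the pattern:

```python
class Toy:
    @property
    def all_special_tokens(self):
        # Like the real property, builds a fresh list on every access.
        return ["[CLS]", "[SEP]", "[PAD]", "[UNK]"]

toy = Toy()
specials = toy.all_special_tokens             # computed once...
for sub_text in ["hello", "[SEP]", "world"]:
    is_special = sub_text in specials         # ...reused inside the hot loop
    print(sub_text, is_special)
```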
@@ -1178,12 +1180,12 @@ class PreTrainedTokenizer(object):
                 if current_sub_text:
                     sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                     current_sub_text = []
-                sub_texts.append(" " + token)
+                sub_texts.append(token)
             else:
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = ''.join(sub_texts)
+        text = ' '.join(sub_texts)
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
...
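The `decode()` change replaces per-token space prepending plus `''.join` with a plain `' '.join` over the collected sub-texts, so an added token no longer glues onto the text that follows it. A worked toy example of the two joins:

```python
# sub_texts as decode() collects them around an added token.
sub_texts = ["Hello", "[SEP]", "world"]

old = ''.join(["Hello", " " + "[SEP]", "world"])  # -> "Hello [SEP]world"
new = ' '.join(sub_texts)                         # -> "Hello [SEP] world"
print(old, "|", new)
```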