"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "45d21502f0b67eb8a5ad244d469dcc0dfb7517a7"
Unverified Commit 3a2c4e6f authored by Thomas Wolf, committed by GitHub

Merge pull request #1548 from huggingface/cli

[2.2] - Command-line interface - Pipeline class
parents 4e3f745b db0795b5
@@ -281,7 +281,9 @@ class PreTrainedModel(nn.Module):
         model_args: (`optional`) Sequence of positional arguments:
             All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-        config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+        config: (`optional`) one of:
+            - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
+            - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
             Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
             - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
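For context, a minimal sketch of what the widened `config` argument allows, assuming a standard transformers 2.x install (model names are the usual pretrained shortcuts):

```python
from transformers import BertConfig, BertModel

# Previously `config` had to be a PretrainedConfig instance:
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True)
model = BertModel.from_pretrained('bert-base-uncased', config=config)

# With this change it may also be a string/path, which from_pretrained
# now resolves itself through cls.config_class.from_pretrained:
model = BertModel.from_pretrained('bert-base-uncased', config='bert-base-uncased')
```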
@@ -336,10 +338,11 @@ class PreTrainedModel(nn.Module):
         proxies = kwargs.pop('proxies', None)
         output_loading_info = kwargs.pop('output_loading_info', False)

-        # Load config
-        if config is None:
+        # Load config if we don't provide a configuration
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
             config, model_kwargs = cls.config_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args,
+                config_path, *model_args,
                 cache_dir=cache_dir, return_unused_kwargs=True,
                 force_download=force_download,
                 resume_download=resume_download,
@@ -408,7 +411,11 @@ class PreTrainedModel(nn.Module):
         model = cls(config, *model_args, **model_kwargs)

         if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+            try:
+                state_dict = torch.load(resolved_archive_file, map_location='cpu')
+            except:
+                raise OSError("Unable to load weights from pytorch checkpoint file. "
+                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
         missing_keys = []
         unexpected_keys = []
...
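A hedged sketch of the failure mode this hunk guards against, with a hypothetical checkpoint directory whose weights file is actually a TF 2.0 checkpoint rather than `pytorch_model.bin`; `torch.load` used to surface an opaque unpickling error, and now the caller gets an OSError pointing at `from_tf=True`:

```python
from transformers import BertModel

try:
    # Weights at this (hypothetical) path are a TF 2.0 checkpoint, not PyTorch
    model = BertModel.from_pretrained('./my-bert-tf-checkpoint')
except OSError:
    # The new error message names the fix explicitly:
    model = BertModel.from_pretrained('./my-bert-tf-checkpoint', from_tf=True)
```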
This diff is collapsed.
@@ -18,7 +18,7 @@ import os
 import json
 import unittest

-from transformers.model_card import ModelCard
+from transformers.modelcard import ModelCard

 from .tokenization_tests_commons import TemporaryDirectory

 class ModelCardTester(unittest.TestCase):
@@ -49,20 +49,20 @@ class ModelCardTester(unittest.TestCase):
         }

     def test_model_card_common_properties(self):
-        model_card = ModelCard.from_dict(self.inputs_dict)
-        self.assertTrue(hasattr(model_card, 'model_details'))
-        self.assertTrue(hasattr(model_card, 'intended_use'))
-        self.assertTrue(hasattr(model_card, 'factors'))
-        self.assertTrue(hasattr(model_card, 'metrics'))
-        self.assertTrue(hasattr(model_card, 'evaluation_data'))
-        self.assertTrue(hasattr(model_card, 'training_data'))
-        self.assertTrue(hasattr(model_card, 'quantitative_analyses'))
-        self.assertTrue(hasattr(model_card, 'ethical_considerations'))
-        self.assertTrue(hasattr(model_card, 'caveats_and_recommendations'))
+        modelcard = ModelCard.from_dict(self.inputs_dict)
+        self.assertTrue(hasattr(modelcard, 'model_details'))
+        self.assertTrue(hasattr(modelcard, 'intended_use'))
+        self.assertTrue(hasattr(modelcard, 'factors'))
+        self.assertTrue(hasattr(modelcard, 'metrics'))
+        self.assertTrue(hasattr(modelcard, 'evaluation_data'))
+        self.assertTrue(hasattr(modelcard, 'training_data'))
+        self.assertTrue(hasattr(modelcard, 'quantitative_analyses'))
+        self.assertTrue(hasattr(modelcard, 'ethical_considerations'))
+        self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations'))

     def test_model_card_to_json_string(self):
-        model_card = ModelCard.from_dict(self.inputs_dict)
-        obj = json.loads(model_card.to_json_string())
+        modelcard = ModelCard.from_dict(self.inputs_dict)
+        obj = json.loads(modelcard.to_json_string())
         for key, value in self.inputs_dict.items():
             self.assertEqual(obj[key], value)
@@ -70,7 +70,7 @@ class ModelCardTester(unittest.TestCase):
         model_card_first = ModelCard.from_dict(self.inputs_dict)
         with TemporaryDirectory() as tmpdirname:
-            filename = os.path.join(tmpdirname, u"model_card.json")
+            filename = os.path.join(tmpdirname, u"modelcard.json")
             model_card_first.to_json_file(filename)
             model_card_second = ModelCard.from_json_file(filename)
...
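The renamed module keeps the same round-trip API these tests exercise; a minimal sketch (the dict content here is illustrative, not the `inputs_dict` from the test):

```python
import json
import os
import tempfile

from transformers.modelcard import ModelCard

card = ModelCard.from_dict({'model_details': {'Organization': 'huggingface'}})
with tempfile.TemporaryDirectory() as tmpdirname:
    filename = os.path.join(tmpdirname, 'modelcard.json')
    card.to_json_file(filename)
    restored = ModelCard.from_json_file(filename)
assert json.loads(card.to_json_string()) == json.loads(restored.to_json_string())
```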
import unittest
from typing import Iterable
from transformers import pipeline
from transformers.tests.utils import require_tf, require_torch
QA_FINETUNED_MODELS = {
('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
}
TF_QA_FINETUNED_MODELS = {
('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
}
TF_NER_FINETUNED_MODELS = {
(
'bert-base-cased',
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
)
}
NER_FINETUNED_MODELS = {
(
'bert-base-cased',
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
)
}
FEATURE_EXTRACT_FINETUNED_MODELS = {
('bert-base-cased', 'bert-base-cased', None),
    # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crashes under TF2
('distilbert-base-uncased', 'distilbert-base-uncased', None)
}
TF_FEATURE_EXTRACT_FINETUNED_MODELS = {
('bert-base-cased', 'bert-base-cased', None),
    # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crashes under TF2
('distilbert-base-uncased', 'distilbert-base-uncased', None)
}
TF_TEXT_CLASSIF_FINETUNED_MODELS = {
(
'bert-base-uncased',
'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
)
}
TEXT_CLASSIF_FINETUNED_MODELS = {
(
'bert-base-uncased',
'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
)
}
class MonoColumnInputTestCase(unittest.TestCase):
def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
self.assertIsNotNone(nlp)
mono_result = nlp(valid_inputs[0])
self.assertIsInstance(mono_result, list)
self.assertIsInstance(mono_result[0], (dict, list))
if isinstance(mono_result[0], list):
mono_result = mono_result[0]
for key in output_keys:
self.assertIn(key, mono_result[0])
multi_result = nlp(valid_inputs)
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], (dict, list))
if isinstance(multi_result[0], list):
multi_result = multi_result[0]
for result in multi_result:
for key in output_keys:
self.assertIn(key, result)
self.assertRaises(Exception, nlp, invalid_inputs)
@require_torch
def test_ner(self):
mandatory_keys = {'entity', 'word', 'score'}
valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
invalid_inputs = [None]
for tokenizer, model, config in NER_FINETUNED_MODELS:
nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
@require_tf
def test_tf_ner(self):
mandatory_keys = {'entity', 'word', 'score'}
valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
invalid_inputs = [None]
for tokenizer, model, config in TF_NER_FINETUNED_MODELS:
nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
@require_torch
def test_sentiment_analysis(self):
mandatory_keys = {'label'}
valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
invalid_inputs = [None]
for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
@require_tf
def test_tf_sentiment_analysis(self):
mandatory_keys = {'label'}
valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
invalid_inputs = [None]
for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
@require_torch
def test_features_extraction(self):
valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
invalid_inputs = [None]
for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
            nlp = pipeline(task='feature-extraction', model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
@require_tf
def test_tf_features_extraction(self):
valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
invalid_inputs = [None]
for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
            nlp = pipeline(task='feature-extraction', model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
class MultiColumnInputTestCase(unittest.TestCase):
def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
self.assertIsNotNone(nlp)
mono_result = nlp(valid_inputs[0])
self.assertIsInstance(mono_result, dict)
for key in output_keys:
self.assertIn(key, mono_result)
multi_result = nlp(valid_inputs)
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], dict)
for result in multi_result:
for key in output_keys:
self.assertIn(key, result)
self.assertRaises(Exception, nlp, invalid_inputs[0])
self.assertRaises(Exception, nlp, invalid_inputs)
@require_torch
def test_question_answering(self):
mandatory_output_keys = {'score', 'answer', 'start', 'end'}
valid_samples = [
{'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
{
'question': 'In what field is HuggingFace working ?',
'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
}
]
invalid_samples = [
{'question': '', 'context': 'This is a test to try empty question edge case'},
{'question': None, 'context': 'This is a test to try empty question edge case'},
            {'question': 'What does it do with an empty context ?', 'context': ''},
            {'question': 'What does it do with an empty context ?', 'context': None},
]
for tokenizer, model, config in QA_FINETUNED_MODELS:
nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
@require_tf
def test_tf_question_answering(self):
mandatory_output_keys = {'score', 'answer', 'start', 'end'}
valid_samples = [
{'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
{
'question': 'In what field is HuggingFace working ?',
'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
}
]
invalid_samples = [
{'question': '', 'context': 'This is a test to try empty question edge case'},
{'question': None, 'context': 'This is a test to try empty question edge case'},
            {'question': 'What does it do with an empty context ?', 'context': ''},
            {'question': 'What does it do with an empty context ?', 'context': None},
]
for tokenizer, model, config in TF_QA_FINETUNED_MODELS:
nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
if __name__ == '__main__':
unittest.main()
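The new test file above drives the Pipeline class end to end; a minimal usage sketch distilled from these tests, with the model/tokenizer pairing taken from QA_FINETUNED_MODELS:

```python
from transformers import pipeline

nlp = pipeline(task='question-answering',
               model='distilbert-base-uncased-distilled-squad',
               tokenizer='bert-base-uncased')
result = nlp({'question': 'Where was HuggingFace founded ?',
              'context': 'HuggingFace was founded in Paris.'})
# Every answer carries the four mandatory keys checked by the tests:
print(result['score'], result['answer'], result['start'], result['end'])
```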
@@ -886,6 +886,92 @@ class PreTrainedTokenizer(object):
                                   return_overflowing_tokens=return_overflowing_tokens,
                                   return_special_tokens_mask=return_special_tokens_mask)
def batch_encode_plus(self,
batch_text_or_text_pairs=None,
add_special_tokens=False,
max_length=None,
stride=0,
truncation_strategy='longest_first',
return_tensors=None,
return_input_lengths=False,
return_attention_masks=False,
**kwargs):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional information:
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args:
batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
This can be a list of string/string-sequences/int-sequences or a list of pair of
string/string-sequences/int-sequence (see details in encode_plus)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary`
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
**kwargs: passed to the `self.tokenize()` method
"""
batch_outputs = {}
for ids_or_pair_ids in batch_text_or_text_pairs:
if isinstance(ids_or_pair_ids, (list, tuple)):
assert len(ids_or_pair_ids) == 2
ids, pair_ids = ids_or_pair_ids
else:
ids, pair_ids = ids_or_pair_ids, None
outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length,
stride=stride, truncation_strategy=truncation_strategy, return_tensors=None)
# Append the non-padded length to the output
if return_input_lengths:
outputs['input_len'] = len(outputs['input_ids'])
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
# Compute longest sequence size
max_seq_len = max(map(len, batch_outputs['input_ids']))
if return_attention_masks:
# Allow the model to not give any special attention to padded input
batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']]
if return_tensors is not None:
# Do the tensor conversion in batch
for key, value in batch_outputs.items():
padded_value = value
if key != 'input_len':
# Padding handle
padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value]
if return_tensors == 'tf' and is_tf_available():
batch_outputs[key] = tf.constant(padded_value)
elif return_tensors == 'pt' and is_torch_available():
batch_outputs[key] = torch.tensor(padded_value)
elif return_tensors is not None:
logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
# encoder_attention_mask requires 1 for real token, 0 for padding, just invert value
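            # (note: this branch selects tf.abs whenever TF is importable, even if
            # return_tensors='pt'; in a mixed TF/PyTorch environment the mask may
            # come back as a tf.Tensor)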
if return_attention_masks:
if is_tf_available():
batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1)
else:
batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1)
return batch_outputs
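A hedged sketch of calling the new ``batch_encode_plus``, assuming a PyTorch-only environment (see the note above about the TF branch) and a standard BERT tokenizer; single sequences and (question, context) pairs can be mixed in one batch:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch = tokenizer.batch_encode_plus(
    ['HuggingFace is solving NLP one commit at a time.',
     ('Where was HuggingFace founded ?', 'HuggingFace was founded in Paris.')],
    add_special_tokens=True,
    max_length=32,
    return_tensors='pt',
    return_attention_masks=True)
# Sequences are padded to the longest item in the batch before tensor conversion
print(batch['input_ids'].shape, batch['attention_mask'].shape)
```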
    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                          truncation_strategy='longest_first',
                          pad_to_max_length=False,
...