Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e1d89cb2
Commit
e1d89cb2
authored
Dec 06, 2019
by
Morgan Funtowicz
Browse files
Added QuestionAnsweringPipeline with batch support.
parent
81babb22
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
226 additions
and
232 deletions
+226
-232
transformers/__init__.py
transformers/__init__.py
+4
-3
transformers/pipeline.py
transformers/pipeline.py
+0
-229
transformers/pipelines.py
transformers/pipelines.py
+222
-0
No files found.
transformers/__init__.py
100644 → 100755
View file @
e1d89cb2
...
...
@@ -65,9 +65,6 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO
from
.configuration_albert
import
AlbertConfig
,
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from
.configuration_camembert
import
CamembertConfig
,
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
# Pipelines
from
.pipeline
import
TextClassificationPipeline
# Modeling
if
is_torch_available
():
from
.modeling_utils
import
(
PreTrainedModel
,
prune_layer
,
Conv1D
)
...
...
@@ -193,6 +190,10 @@ from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name
load_tf2_weights_in_pytorch_model
,
load_tf2_model_in_pytorch_model
)
# Pipelines
# from .pipeline_ import TextClassificationPipeline
from
.pipelines
import
Pipeline
,
pipeline
,
TextClassificationPipeline
if
not
is_tf_available
()
and
not
is_torch_available
():
logger
.
warning
(
"Neither PyTorch nor TensorFlow >= 2.0 have been found."
"Models won't be available and only tokenizers, configuration"
...
...
transformers/pipeline.py
deleted
100644 → 0
View file @
81babb22
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Pipeline class: Tokenizer + Model. """
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
logging
import
six
from
.tokenization_auto
import
AutoTokenizer
from
.file_utils
import
add_start_docstrings
,
is_tf_available
,
is_torch_available
from
.data.processors
import
SingleSentenceClassificationProcessor
if
is_tf_available
():
import
tensorflow
as
tf
from
.modeling_tf_auto
import
(
TFAutoModel
,
TFAutoModelForQuestionAnswering
,
TFAutoModelForSequenceClassification
,
TFAutoModelWithLMHead
)
if
is_torch_available
():
import
torch
from
.modeling_auto
import
(
AutoModel
,
AutoModelForQuestionAnswering
,
AutoModelForSequenceClassification
,
AutoModelWithLMHead
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# TF training parameters
# USE_AMP is read by TextClassificationPipeline.compile to decide whether the
# optimizer is wrapped in a LossScaleOptimizer. USE_XLA is not referenced by
# the code visible in this file.
USE_XLA = False
USE_AMP = False
class TextClassificationPipeline(object):
    r"""
    Couple a tokenizer with a sequence-classification model behind a
    scikit-learn / Keras style interface (``fit``, ``transform``, ``predict``).

    Instances are normally built with :meth:`from_pretrained`, which loads the
    tokenizer through ``AutoTokenizer`` and the model through
    ``TFAutoModelForSequenceClassification`` when TensorFlow is available, or
    ``AutoModelForSequenceClassification`` otherwise.
    """

    def __init__(self, tokenizer, model, is_compiled=False, is_trained=False):
        """
        Args:
            tokenizer: tokenizer used to encode the input texts.
            model: sequence-classification model called on the encoded inputs.
            is_compiled: whether ``compile`` has already been run on the model.
            is_trained: whether every weight of the model is considered trained;
                when False, ``__call__`` logs an error before predicting.
        """
        self.tokenizer = tokenizer
        self.model = model
        self.is_compiled = is_compiled
        self.is_trained = is_trained

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r""" Instantiates a pipeline from a pre-trained tokenizer and model.

        Keyword arguments are routed by prefix:
            - ``tokenizer_<name>`` is passed to the tokenizer as ``<name>``;
            - ``model_<name>`` is passed to the model as ``<name>``;
            - any other keyword is passed to both.

        The pipeline is flagged as trained only when loading the model reported
        no missing keys.
        """
        tokenizer_prefix, model_prefix = 'tokenizer_', 'model_'
        tokenizer_kwargs, model_kwargs = {}, {}

        # Build both dictionaries in a single pass without touching ``kwargs``:
        # popping from a dict while iterating over it raises RuntimeError.
        for key, value in kwargs.items():
            if key.startswith(tokenizer_prefix):
                # Strip the leading prefix only (not every occurrence)
                tokenizer_kwargs[key[len(tokenizer_prefix):]] = value
            elif key.startswith(model_prefix):
                model_kwargs[key[len(model_prefix):]] = value
            else:
                # used for both the tokenizer and the model
                tokenizer_kwargs[key] = value
                model_kwargs[key] = value

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs)

        # Ask for the loading report so we can tell whether some weights are missing
        model_kwargs['output_loading_info'] = True
        if is_tf_available():
            model_cls = TFAutoModelForSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification
        model, loading_info = model_cls.from_pretrained(pretrained_model_name_or_path, **model_kwargs)

        return cls(tokenizer, model, is_trained=bool(not loading_info['missing_keys']))

    def save_pretrained(self, save_directory):
        """
        Save the model and the tokenizer in ``save_directory``.

        If ``save_directory`` is not an existing directory, an error is logged
        and nothing is saved.
        """
        if not os.path.isdir(save_directory):
            logger.error("Saving directory ({}) should be a directory".format(save_directory))
            return
        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    def prepare_data(self, x, y=None, validation_data=None, validation_split=0.1, **kwargs):
        """
        Turn raw examples into tokenized training and validation features.

        Args:
            x: a ``SingleSentenceClassificationProcessor`` or examples from
                which one is created (together with ``y``).
            y: labels, only used when ``x`` is not already a processor.
            validation_data: optional validation examples (or processor). When
                given, all of ``x`` is used for training.
            validation_split: fraction of ``x`` held out for validation when
                ``validation_data`` is None; at least one sample is held out.

        Returns:
            Tuple ``(train_features, valid_features)``.
        """
        dataset = x
        if not isinstance(x, SingleSentenceClassificationProcessor):
            dataset = SingleSentenceClassificationProcessor.create_from_examples(x, y)

        if validation_data is not None:
            valid_dataset = validation_data
            if not isinstance(validation_data, SingleSentenceClassificationProcessor):
                valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(validation_data)
            train_dataset = dataset
        else:
            assert 0.0 <= validation_split <= 1.0, "validation_split should be between 0.0 and 1.0"
            # The first ``num_valid_samples`` examples form the validation set
            num_valid_samples = max(int(len(dataset) * validation_split), 1)
            train_dataset = dataset[num_valid_samples:]
            valid_dataset = dataset[:num_valid_samples]

        logger.info('Tokenizing and processing dataset')
        framework = 'tf' if is_tf_available() else 'pt'
        train_dataset = train_dataset.get_features(self.tokenizer, return_tensors=framework)
        valid_dataset = valid_dataset.get_features(self.tokenizer, return_tensors=framework)
        return train_dataset, valid_dataset

    def compile(self, learning_rate=3e-5, adam_epsilon=1e-8, **kwargs):
        """
        Compile the underlying tf.keras model with an Adam optimizer, a sparse
        categorical cross-entropy loss (on logits) and an accuracy metric.

        Raises:
            NotImplementedError: when TensorFlow is not available.
        """
        if is_tf_available():
            logger.info('Preparing model')
            # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
            opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon)
            if USE_AMP:
                # loss scaling is currently required when using mixed precision
                opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
            self.model.compile(optimizer=opt, loss=loss, metrics=[metric])
        else:
            raise NotImplementedError
        self.is_compiled = True

    def fit(self, X=None, y=None, validation_data=None, validation_split=0.1,
            train_batch_size=None, valid_batch_size=None, **kwargs):
        """
        Train the model on ``X`` / ``y`` for two epochs (TensorFlow only).

        The model is compiled first if needed. ``kwargs`` are forwarded both to
        ``compile`` and to the Keras ``fit`` call. ``train_batch_size`` and
        ``valid_batch_size`` must be integers: they are used to compute the
        number of steps, so leaving them to None raises a TypeError.

        Raises:
            NotImplementedError: when TensorFlow is not available.
        """
        if not self.is_compiled:
            self.compile(**kwargs)

        train_dataset, valid_dataset = self.prepare_data(X, y=y,
                                                         validation_data=validation_data,
                                                         validation_split=validation_split)
        train_steps = len(train_dataset) // train_batch_size
        valid_steps = len(valid_dataset) // valid_batch_size

        if is_tf_available():
            # Prepare dataset as a tf.train_data.Dataset instance
            train_dataset = train_dataset.shuffle(128).batch(train_batch_size).repeat(-1)
            valid_dataset = valid_dataset.batch(valid_batch_size)

            logger.info('Training TF 2.0 model')
            self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
                           validation_data=valid_dataset, validation_steps=valid_steps, **kwargs)
        else:
            raise NotImplementedError
        self.is_trained = True

    def fit_transform(self, *texts, **kwargs):
        # Generic compatibility with sklearn and Keras
        self.fit(*texts, **kwargs)
        return self(*texts, **kwargs)

    def transform(self, *texts, **kwargs):
        # Generic compatibility with sklearn and Keras
        return self(*texts, **kwargs)

    def predict(self, *texts, **kwargs):
        # Generic compatibility with sklearn and Keras
        return self(*texts, **kwargs)

    def __call__(self, *texts, **kwargs):
        """
        Encode ``texts`` and return the first output of the model as nested lists.

        The texts can also be given through the ``X`` keyword (sklearn / Keras
        convention) when no positional text is provided.
        """
        # Generic compatibility with sklearn and Keras
        if 'X' in kwargs and not texts:
            texts = kwargs.pop('X')

        if not self.is_trained:
            logger.error("Some weights of the model are not trained. Please fine-tune the model on a classification task before using it.")

        inputs = self.tokenizer.batch_encode_plus(texts, add_special_tokens=True,
                                                  return_tensors='tf' if is_tf_available() else 'pt')

        if is_tf_available():
            # TODO trace model
            predictions = self.model(**inputs)[0]
        else:
            with torch.no_grad():
                predictions = self.model(**inputs)[0]

        return predictions.numpy().tolist()
transformers/pipelines.py
0 → 100755
View file @
e1d89cb2
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
from
abc
import
ABC
,
abstractmethod
from
typing
import
Union
,
Optional
,
Tuple
import
numpy
as
np
from
transformers
import
is_tf_available
,
logger
,
AutoTokenizer
,
PreTrainedTokenizer
,
is_torch_available
# Import the model classes of each framework independently. SUPPORTED_TASKS
# references the PyTorch classes whenever PyTorch is available, so they must be
# imported even when TensorFlow is installed as well (an ``else`` branch here
# would leave them undefined in that case).
if is_tf_available():
    from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering

if is_torch_available():
    from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
class Pipeline(ABC):
    """
    Abstract base of every pipeline: holds a model and its tokenizer.

    Subclasses have to provide ``from_config`` (alternate constructor) and
    ``__call__`` (the actual inference); ``transform`` and ``predict`` are
    thin aliases delegating to ``__call__``.
    """

    def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs):
        # Extra keyword arguments are accepted but not used by the base class.
        self.model = model
        self.tokenizer = tokenizer

    @classmethod
    @abstractmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        """Build a pipeline from a model and a tokenizer; must be overridden."""
        raise NotImplementedError()

    def save_pretrained(self, save_directory):
        """
        Save the model, then the tokenizer, inside ``save_directory``.

        When ``save_directory`` is not an existing directory an error is
        logged and nothing is written.
        """
        if os.path.isdir(save_directory):
            self.model.save_pretrained(save_directory)
            self.tokenizer.save_pretrained(save_directory)
        else:
            logger.error("Provided path ({}) should be a directory".format(save_directory))

    def transform(self, *texts, **kwargs):
        """sklearn / Keras style alias: same as calling the pipeline."""
        return self(*texts, **kwargs)

    def predict(self, *texts, **kwargs):
        """sklearn / Keras style alias: same as calling the pipeline."""
        return self(*texts, **kwargs)

    @abstractmethod
    def __call__(self, *texts, **kwargs):
        """Run the pipeline on ``texts``; must be overridden."""
        raise NotImplementedError()
class TextClassificationPipeline(Pipeline):
    """
    Pipeline running a sequence-classification model on raw texts.

    Calling the pipeline encodes the texts with the tokenizer and returns the
    first output of the model converted to nested Python lists.
    """

    def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2):
        """
        Args:
            model: sequence-classification model called on the encoded inputs.
            tokenizer: tokenizer used to encode the texts.
            nb_classes: number of target classes, at least 2.

        Raises:
            Exception: if ``nb_classes`` is lower than 2.
        """
        super().__init__(model, tokenizer)

        if nb_classes < 2:
            raise Exception('Invalid parameter nb_classes. int >= 2 is required (got: {})'.format(nb_classes))
        self._nb_classes = nb_classes

    @classmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        """Build the pipeline from a model, a tokenizer and constructor kwargs."""
        return cls(model, tokenizer, **kwargs)

    def __call__(self, *texts, **kwargs):
        """
        Classify ``texts``.

        The texts can also be given through the ``X`` keyword (sklearn / Keras
        convention) when no positional text is provided.

        Returns:
            The first output of the model as nested lists.
        """
        # Generic compatibility with sklearn and Keras
        if 'X' in kwargs and not texts:
            texts = kwargs.pop('X')

        inputs = self.tokenizer.batch_encode_plus(
            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
        )

        # The mask is not forwarded to the model; the default keeps this from
        # failing when the tokenizer did not return one.
        inputs.pop('special_tokens_mask', None)

        if is_tf_available():
            # TODO trace model
            predictions = self.model(**inputs)[0]
        else:
            import torch
            with torch.no_grad():
                predictions = self.model(**inputs)[0]

        return predictions.numpy().tolist()
class QuestionAnsweringPipeline(Pipeline):
    """
    Pipeline extracting the answer to a question from a context.

    Each input is a pair whose first element is the question and whose second
    element is the context: the answer is searched among the tokens of the
    second segment and mapped back onto the words of the second element.
    """

    @classmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        """Build the pipeline from a model, a tokenizer and constructor kwargs."""
        return cls(model, tokenizer, **kwargs)

    def __call__(self, texts=None, **kwargs):
        """
        Answer one or several (question, context) pairs.

        Args:
            texts: a single tuple or a list of tuples. May also be given
                through the ``X`` keyword (sklearn / Keras convention).

        Returns:
            One dict per input with the keys ``'start'`` and ``'end'`` (span
            returned by :meth:`decode`, expressed in context tokens) and
            ``'answer'`` (the matching words of the context).

        Raises:
            Exception: if ``texts`` is neither a tuple nor a list.
        """
        # Generic compatibility with sklearn and Keras
        if 'X' in kwargs and not texts:
            texts = kwargs.pop('X')

        if not isinstance(texts, (tuple, list)):
            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (question, context) or a List of tuple.')

        # A single pair is handled as a batch of one
        if not isinstance(texts, list):
            texts = [texts]

        inputs = self.tokenizer.batch_encode_plus(
            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
        )

        # The mask must not be forwarded to the model; the default keeps this
        # from failing when the tokenizer did not return one.
        inputs.pop('special_tokens_mask', None)

        if is_tf_available():
            # TODO trace model
            start, end = self.model(inputs)
            start, end = start.numpy(), end.numpy()
        else:
            import torch
            with torch.no_grad():
                start, end = self.model(**inputs)
            start, end = start.cpu().numpy(), end.cpu().numpy()

        answers = []
        for i, pair in enumerate(texts):
            # Retrieve the score for the context tokens only (removing question tokens)
            context_idx = inputs['token_type_ids'][i] == 1
            start_, end_ = start[i, context_idx], end[i, context_idx]

            # Normalize logits and spans to retrieve the answer
            start_, end_ = self.decode(start_, end_)

            # Convert the answer (tokens) back to the original text
            answers.append({
                'start': start_,
                'end': end_,
                'answer': self.span_to_answer(pair[1], start_, end_)
            })

        return answers

    @staticmethod
    def _softmax(logits: np.ndarray) -> np.ndarray:
        """Numerically stable softmax over the last axis."""
        shifted = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        return shifted / np.sum(shifted, axis=-1, keepdims=True)

    def decode(self, start: np.ndarray, end: np.ndarray, max_answer_len: int = 15) -> Tuple:
        """
        Pick the most likely answer span from start / end logits.

        Args:
            start: start logits, 1-D (one sequence) or 2-D (batch, tokens).
            end: end logits, same layout as ``start``.
            max_answer_len: maximum allowed value of ``end - start``.

        Returns:
            Tuple ``(start, end)`` of index arrays with one entry per sequence.
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Turn logits into probabilities first: multiplying raw logits would
        # let two negative scores outrank every valid candidate.
        start, end = self._softmax(start), self._softmax(end)

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidate with end < start and end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len)

        start = np.max(candidates, axis=2).argmax(-1)
        end = np.max(candidates, axis=1).argmax(-1)

        return start, end

    def span_to_answer(self, text: str, start: int, end: int):
        """
        Map a token span back to the words of ``text``.

        ``text`` is split on single spaces; a word is kept when the index of
        its first sub-token lies within ``[start, end]``.

        Returns:
            The selected words joined with spaces.
        """
        words, token_idx = [], 0

        for word in text.split(" "):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                words.append(word)

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)

        # Join text with spaces
        return ' '.join(words)
# Task registry: maps a task name to the pipeline class implementing it
# ('impl') and to the model class used to load the weights with TensorFlow
# ('tf') or PyTorch ('pt'). A framework entry is None when that framework is
# not available.
SUPPORTED_TASKS = dict([
    ('text-classification', dict(
        impl=TextClassificationPipeline,
        tf=TFAutoModelForSequenceClassification if is_tf_available() else None,
        pt=AutoModelForSequenceClassification if is_torch_available() else None,
    )),
    ('question-answering', dict(
        impl=QuestionAnsweringPipeline,
        tf=TFAutoModelForQuestionAnswering if is_tf_available() else None,
        pt=AutoModelForQuestionAnswering if is_torch_available() else None,
    )),
])
def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline:
    """
    Utility factory method to build pipeline.

    Args:
        task: name of the task, one of the keys of ``SUPPORTED_TASKS``.
        model: model name / path (str) to load with the task's model class,
            or an already instantiated model which is used as is.
        tokenizer: tokenizer instance, or name / path to load with
            ``AutoTokenizer``. When None it defaults to ``model``, which then
            has to be a str.
        **kwargs: forwarded to the constructor of the pipeline class.

    Returns:
        An instance of the pipeline class registered for ``task``.

    Raises:
        KeyError: if ``task`` is not a supported task.
        Exception: if ``tokenizer`` is None while ``model`` is not a str.
    """
    # Validate the task first so nothing is loaded for an unknown task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    # Try to infer tokenizer from model name (if provided as str)
    if tokenizer is None:
        if not isinstance(model, str):
            # Impossible to guess what is the right tokenizer here
            raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance')
        tokenizer = model

    # Load the tokenizer unless an instance was provided
    if not isinstance(tokenizer, PreTrainedTokenizer):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    targeted_task = SUPPORTED_TASKS[task]
    pipeline_cls = targeted_task['impl']
    allocator = targeted_task['tf'] if is_tf_available() else targeted_task['pt']

    # Only names / paths need loading; a model instance is used directly
    if isinstance(model, str):
        model = allocator.from_pretrained(model)

    return pipeline_cls(model, tokenizer, **kwargs)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment