chenpangpang / transformers · Commits

Commit be5bf7b8 authored Dec 13, 2019 by Morgan Funtowicz

Added NER pipeline.

parent 80eacb8f
Showing 1 changed file with 388 additions and 332 deletions.

transformers/pipelines.py  +388  -332
View file @ be5bf7b8
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
from abc import ABC, abstractmethod
from itertools import groupby
from typing import Union, Optional, Tuple, List, Dict

import numpy as np

from transformers import AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \
    SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger

if is_tf_available():
    from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, \
        TFAutoModelForTokenClassification

if is_torch_available():
    import torch
    from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, \
        AutoModelForTokenClassification

class Pipeline(ABC):
    def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs):
        self.model = model
        self.tokenizer = tokenizer

    @classmethod
    @abstractmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        raise NotImplementedError()

    def save_pretrained(self, save_directory):
        if not os.path.isdir(save_directory):
            logger.error("Provided path ({}) should be a directory".format(save_directory))
            return

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    def transform(self, *texts, **kwargs):
        # Generic compatibility with sklearn and Keras
        return self(*texts, **kwargs)

    def predict(self, *texts, **kwargs):
        # Generic compatibility with sklearn and Keras
        return self(*texts, **kwargs)

    @abstractmethod
    def __call__(self, *texts, **kwargs):
        raise NotImplementedError()

class TextClassificationPipeline(Pipeline):
    def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2):
        super().__init__(model, tokenizer)

        if nb_classes < 2:
            raise Exception('Invalid parameter nb_classes. int >= 2 is required (got: {})'.format(nb_classes))
        self._nb_classes = nb_classes

    @classmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        return cls(model, tokenizer, **kwargs)

    def __call__(self, *texts, **kwargs):
        # Generic compatibility with sklearn and Keras
        if 'X' in kwargs and not texts:
            texts = kwargs.pop('X')

        inputs = self.tokenizer.batch_encode_plus(
            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
        )

        special_tokens_mask = inputs.pop('special_tokens_mask')

        if is_tf_available():
            # TODO trace model
            predictions = self.model(**inputs)[0]
        else:
            import torch
            with torch.no_grad():
                predictions = self.model(**inputs)[0]

        return predictions.numpy().tolist()

class NerPipeline(Pipeline):

    def __init__(self, model, tokenizer: PreTrainedTokenizer):
        super().__init__(model, tokenizer)

    @classmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        pass

    def __call__(self, *texts, **kwargs):
        (texts,), answers = texts, []

        for sentence in texts:

            # Ugly token to word idx mapping (for now)
            token_to_word, words = [], sentence.split(' ')
            for i, w in enumerate(words):
                tokens = self.tokenizer.tokenize(w)
                token_to_word += [i] * len(tokens)

            tokens = self.tokenizer.encode_plus(sentence, return_attention_mask=False, return_tensors='tf' if is_tf_available() else 'pt')

            # Forward
            if is_torch_available():
                with torch.no_grad():
                    entities = self.model(**tokens)[0][0].cpu().numpy()
            else:
                entities = self.model(tokens)[0][0].numpy()

            # Normalize scores
            answer, token_start = [], 1
            for idx, word in groupby(token_to_word[1:-1]):

                # Sum log prob over token, then normalize across labels
                score = np.exp(entities[token_start]) / np.exp(entities[token_start]).sum(-1, keepdims=True)
                label_idx = score.argmax()

                answer += [{
                    'word': words[idx - 1],
                    'score': score[label_idx],
                    'entity': self.model.config.id2label[label_idx]
                }]

                # Update token start
                token_start += len(list(word))

            # Append
            answers += [answer]
        return answers

class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline involving Tokenization and Inference.
    """

    @classmethod
    def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
        pass

    @staticmethod
    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]:
        is_list = isinstance(question, list)

        if is_list:
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    @staticmethod
    def handle_args(*inputs, **kwargs) -> List[SquadExample]:
        # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
        if inputs is not None and len(inputs) > 1:
            kwargs['X'] = inputs

        # Generic compatibility with sklearn and Keras
        # Batched data
        if 'X' in kwargs or 'data' in kwargs:
            data = kwargs['X'] if 'X' in kwargs else kwargs['data']

            if not isinstance(data, list):
                data = [data]

            for i, item in enumerate(data):
                if isinstance(item, dict):
                    if any(k not in item for k in ['question', 'context']):
                        raise KeyError('You need to provide a dictionary with keys {question:..., context:...}')
                    data[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif isinstance(item, SquadExample):
                    continue
                else:
                    raise ValueError(
                        '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)'
                        .format('X' if 'X' in kwargs else 'data')
                    )
            inputs = data

        # Tabular input
        elif 'question' in kwargs and 'context' in kwargs:
            if isinstance(kwargs['question'], str):
                kwargs['question'] = [kwargs['question']]

            if isinstance(kwargs['context'], str):
                kwargs['context'] = [kwargs['context']]

            inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])]
        else:
            raise ValueError('Unknown arguments {}'.format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs

    def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]):
        super().__init__(model, tokenizer)

    def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict:
        args = ['input_ids', 'attention_mask']
        model_type = type(self.model).__name__.lower()

        if 'distilbert' not in model_type and 'xlm' not in model_type:
            args += ['token_type_ids']

        if 'xlnet' in model_type or 'xlm' in model_type:
            args += ['cls_index', 'p_mask']

        if isinstance(features, SquadExample):
            return {k: features.__dict__[k] for k in args}
        else:
            return {k: [feature.__dict__[k] for feature in features] for k in args}

    def __call__(self, *texts, **kwargs):
        # Set defaults values
        kwargs.setdefault('topk', 1)
        kwargs.setdefault('doc_stride', 128)
        kwargs.setdefault('max_answer_len', 15)
        kwargs.setdefault('max_seq_len', 384)
        kwargs.setdefault('max_question_len', 64)

        if kwargs['topk'] < 1:
            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))

        if kwargs['max_answer_len'] < 1:
            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))

        examples = QuestionAnsweringPipeline.handle_args(texts, **kwargs)

        # Convert inputs to features
        features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False)
        fw_args = self.inputs_for_model(features)

        if is_tf_available():
            import tensorflow as tf
            fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
            start, end = self.model(fw_args)
            start, end = start.numpy(), end.numpy()
        else:
            import torch
            with torch.no_grad():
                # Retrieve the score for the context tokens only (removing question tokens)
                fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()}
                start, end = self.model(**fw_args)
                start, end = start.cpu().numpy(), end.cpu().numpy()

        answers = []
        for (example, feature, start_, end_) in zip(examples, features, start, end):
            # Normalize logits and spans to retrieve the answer
            start_ = np.exp(start_) / np.sum(np.exp(start_))
            end_ = np.exp(end_) / np.sum(np.exp(end_))

            # Mask padding and question
            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)

            # TODO : What happend if not possible
            # Mask CLS
            start_[0] = end_[0] = 0

            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
            char_to_word = np.array(example.char_to_word_offset)

            # Convert the answer (tokens) back to the original text
            answers += [[
                {
                    'score': score,
                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0],
                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1],
                    'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1])
                }
                for s, e, score in zip(starts, ends, scores)
            ]]

        return answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidate with end < start and end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int):
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}

# Register all the supported task here
SUPPORTED_TASKS = {
    'text-classification': {
        'impl': TextClassificationPipeline,
        'tf': TFAutoModelForSequenceClassification if is_tf_available() else None,
        'pt': AutoModelForSequenceClassification if is_torch_available() else None
    },
    'ner': {
        'impl': NerPipeline,
        'tf': TFAutoModelForTokenClassification if is_tf_available() else None,
        'pt': AutoModelForTokenClassification if is_torch_available() else None,
    },
    'question-answering': {
        'impl': QuestionAnsweringPipeline,
        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
        'pt': AutoModelForQuestionAnswering if is_torch_available() else None
    }
}

def pipeline(task: str, model, config: Optional[PretrainedConfig] = None,
             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline:
    """
    Utility factory method to build pipeline.
    """
    # Try to infer tokenizer from model name (if provided as str)
    if tokenizer is None and isinstance(model, str):
        tokenizer = model
    else:
        # Impossible to guest what is the right tokenizer here
        raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance')

    tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer)

    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    targeted_task = SUPPORTED_TASKS[task]
    task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']

    model = allocator.from_pretrained(model)
    return task(model, tokenizer, **kwargs)
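
A minimal usage sketch of the newly registered 'ner' task (not part of the committed file): it assumes either TensorFlow or PyTorch is installed, and the checkpoint name below is a placeholder for any token-classification model whose config defines id2label.

from transformers.pipelines import pipeline

# Placeholder checkpoint: substitute any model fine-tuned for token classification
# (e.g. CoNLL-2003 NER) whose config provides an id2label mapping.
nlp = pipeline('ner', model='bert-base-cased-finetuned-ner')

# NerPipeline takes its inputs as a single list of sentences and returns, per
# sentence, a list of {'word', 'score', 'entity'} dicts for the grouped sub-tokens.
print(nlp(['Hugging Face is based in New York City']))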