chenpangpang / transformers · Commits

Commit 02110485, authored Dec 06, 2019 by Morgan Funtowicz
Added batching, topk, chars index and scores.
parent e1d89cb2

Showing 1 changed file with 86 additions and 24 deletions:
transformers/pipelines.py (+86, -24)
```diff
@@ -16,7 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 from abc import ABC, abstractmethod
-from typing import Union, Optional, Tuple
+from typing import Union, Optional, Tuple, List, Dict
 import numpy as np
```
```diff
@@ -24,7 +24,8 @@ from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedToken
 if is_tf_available():
     from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
-else:
+
+if is_torch_available():
     from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
```
```diff
@@ -94,30 +95,71 @@ class TextClassificationPipeline(Pipeline):
 class QuestionAnsweringPipeline(Pipeline):
-    """
-    Question Answering pipeling involving Tokenization and Inference.
-    TODO:
-        - top-k answers
-        - return start/end chars
-        - return score
-    """
+
+    def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]):
+        super().__init__(model, tokenizer)
+
+    @staticmethod
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]:
+        is_list = isinstance(question, list)
+
+        if is_list:
+            return [{'question': q, 'context': c} for q, c in zip(question, context)]
+        else:
+            return {'question': question, 'context': context}
 
     @classmethod
     def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
         pass
 
-    def __call__(self, texts, **kwargs):
+    def __call__(self, *texts, **kwargs):
+        # Set defaults values
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('topk', 1)
+
+        if kwargs['topk'] < 1:
+            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+
+        if kwargs['max_answer_len'] < 1:
+            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+
+        # Tabular input
+        if 'question' in kwargs and 'context' in kwargs:
+            texts = QuestionAnsweringPipeline.create_sample(kwargs['questions'], kwargs['contexts'])
+        elif 'data' in kwargs:
+            texts = kwargs['data']
 
         # Generic compatibility with sklearn and Keras
-        if 'X' in kwargs and not texts:
+        elif 'X' in kwargs and not texts:
             texts = kwargs.pop('X')
+        else:
+            (texts, ) = texts
 
-        if not isinstance(texts, (tuple, list)):
-            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.')
+        if not isinstance(texts, (dict, list)):
+            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of dict.')
 
         if not isinstance(texts, list):
             texts = [texts]
 
+        # Map to tuple (question, context)
+        texts = [(text['question'], text['context']) for text in texts]
+
         inputs = self.tokenizer.batch_encode_plus(
-            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            # texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            texts, add_special_tokens=True, return_tensors='pt'
         )
 
         # Remove special_tokens_mask to avoid KeyError
         _ = inputs.pop('special_tokens_mask')
 
-        if is_tf_available():
+        # if is_tf_available():
+        if False:
             # TODO trace model
             start, end = self.model(inputs)
         else:
```
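For orientation, here is a minimal sketch of how the batched `__call__` introduced in this hunk could be driven; the pipeline instance `nlp` and all questions and contexts are made-up assumptions, not part of the commit. Note that the tabular branch guards on the singular keys `'question'`/`'context'` but then reads the plural `'questions'`/`'contexts'`, so this sketch uses the dict-list form via `data=`:

```python
# Hypothetical pipeline instance; construction is sketched after the last hunk.
answers = nlp(
    data=[
        {'question': 'Where is HuggingFace based?',
         'context': 'HuggingFace is based in New York City.'},
        {'question': 'What does the pipeline return?',
         'context': 'It returns the answer span, char indices and a score.'},
    ],
    topk=2,             # defaults to 1; must be >= 1
    max_answer_len=15,  # defaults to 15; must be >= 1
)
# With this commit, each input contributes a list of topk dicts:
# [[{'score': ..., 'answer': ..., 'start': ..., 'end': ...}, ...], ...]
```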
```diff
@@ -133,18 +175,19 @@ class QuestionAnsweringPipeline(Pipeline):
             start_, end_ = start[i, context_idx], end[i, context_idx]
 
             # Normalize logits and spans to retrieve the answer
-            start_, end_ = self.decode(start_, end_)
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
 
             # Convert the answer (tokens) back to the original text
-            answers += [{
-                'start': start_,
-                'end': end_,
-                'answer': self.span_to_answer(texts[i][1], start_, end_)
-            }]
+            answers += [[
+                {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)}
+                for s, e, score in zip(starts, ends, scores)
+            ]]
 
         return answers
 
-    def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple:
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         # Ensure we have batch axis
         if start.ndim == 1:
             start = start[None]
```
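The normalization added above turns raw start/end logits into probabilities before decoding, so a span's score can be read as P(start) * P(end). A minimal numpy sketch with made-up logits, mirroring (not importing) the lines above:

```python
import numpy as np

# Made-up logits for a 3-token context.
start_ = np.array([2.0, 0.5, 0.1])
end_ = np.array([0.3, 1.5, 0.2])

# Same normalization as in the hunk: a softmax over the context axis.
start_ = np.exp(start_) / np.sum(np.exp(start_))
end_ = np.exp(end_) / np.sum(np.exp(end_))

# Each is now a proper distribution (sums to 1), so the product
# start_[i] * end_[j] used by decode() behaves like a joint span score.
assert np.isclose(start_.sum(), 1.0) and np.isclose(end_.sum(), 1.0)
```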
```diff
@@ -155,22 +198,39 @@ class QuestionAnsweringPipeline(Pipeline):
         # Compute the score of each tuple(start, end) to be the real answer
         outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
 
-        # Remove candidate with end < start and end - start > 15
-        candidates = np.tril(np.triu(outer), 15)
-
-        start = np.max(candidates, axis=2).argmax(-1)
-        end = np.max(candidates, axis=1).argmax(-1)
-
-        return start, end
+        # Remove candidate with end < start and end - start > max_answer_len
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)
+
+        # start = np.max(candidates, axis=2).argmax(-1)
+        # end = np.max(candidates, axis=1).argmax(-1)
+        scores_flat = candidates.flatten()
+        if topk == 1:
+            idx_sort = [np.argmax(scores_flat)]
+        elif len(scores_flat) < topk:
+            idx_sort = np.argsort(-scores_flat)
+        else:
+            idx = np.argpartition(-scores_flat, topk)[0:topk]
+            idx_sort = idx[np.argsort(-scores_flat[idx])]
+
+        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+        return start, end, candidates[0, start, end]
 
     def span_to_answer(self, text: str, start: int, end: int):
-        words, token_idx = [], 0
+        words = []
+        token_idx = char_start_idx = char_end_idx = chars_idx = 0
 
         for i, word in enumerate(text.split(" ")):
             token = self.tokenizer.tokenize(word)
 
             # Append words if they are in the span
             if start <= token_idx <= end:
+                if token_idx == start:
+                    char_start_idx = chars_idx
+
+                if token_idx == end:
+                    char_end_idx = chars_idx + len(word)
+
                 words += [word]
 
             # Stop if we went over the end of the answer
```
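To make the new `decode` concrete, here is a self-contained numpy sketch of the same scoring trick on a toy example; it mirrors the lines above rather than importing them, and all probability values are made up:

```python
import numpy as np

# Toy start/end probabilities for a 5-token context, batch size 1.
start = np.array([[0.1, 0.6, 0.1, 0.1, 0.1]])
end = np.array([[0.1, 0.1, 0.2, 0.5, 0.1]])

# Score every (start, end) pair as P(start) * P(end): shape (1, 5, 5).
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

# triu keeps spans with end >= start; tril(..., k) additionally
# keeps only spans of length <= max_answer_len.
max_answer_len = 3
candidates = np.tril(np.triu(outer), max_answer_len - 1)

# Top-k over the flattened score matrix, then recover (start, end)
# coordinates; [1:] drops the batch axis that unravel_index returns.
topk = 2
scores_flat = candidates.flatten()
idx = np.argpartition(-scores_flat, topk)[0:topk]
idx_sort = idx[np.argsort(-scores_flat[idx])]
starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
print(starts, ends, candidates[0, starts, ends])
# [1 1] [3 2] [0.3  0.12]: the span (1, 3) wins, then (1, 2);
# (1, 4) is banned because its length exceeds max_answer_len.
```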
```diff
@@ -179,9 +239,10 @@ class QuestionAnsweringPipeline(Pipeline):
             # Append the subtokenization length to the running index
             token_idx += len(token)
+            chars_idx += len(word) + 1
 
         # Join text with spaces
-        return ' '.join(words)
+        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}
 
 # Register all the supported task here
```
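The `span_to_answer` changes above walk the whitespace-split words while tracking two running indices: `token_idx` advances by the subtoken count of each word, `chars_idx` by the word's character length plus one for the space. A standalone mimic of that bookkeeping, with a trivial one-token-per-word tokenizer standing in for `self.tokenizer.tokenize` (an assumption for the sketch) and the early-exit from the collapsed context omitted:

```python
def span_to_answer(text, start, end, tokenize=lambda w: [w]):
    # Hedged mimic of the pipeline method; start/end are token indices.
    words = []
    token_idx = char_start_idx = char_end_idx = chars_idx = 0
    for word in text.split(" "):
        token = tokenize(word)
        if start <= token_idx <= end:
            if token_idx == start:
                char_start_idx = chars_idx       # char offset where the answer begins
            if token_idx == end:
                char_end_idx = chars_idx + len(word)  # char offset where it ends
            words.append(word)
        token_idx += len(token)                  # subtokens consumed so far
        chars_idx += len(word) + 1               # characters consumed, plus the space
    return {'answer': ' '.join(words),
            'start': max(0, char_start_idx),
            'end': min(len(text), char_end_idx)}

print(span_to_answer("New York City is large", 0, 2))
# {'answer': 'New York City', 'start': 0, 'end': 13}
```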
```diff
@@ -193,7 +254,7 @@ SUPPORTED_TASKS = {
     },
     'question-answering': {
         'impl': QuestionAnsweringPipeline,
-        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        # 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
         'pt': AutoModelForQuestionAnswering if is_torch_available() else None
     }
 }
```
```diff
@@ -216,7 +277,8 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenize
         raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))
 
     targeted_task = SUPPORTED_TASKS[task]
-    task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    # task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    task, allocator = targeted_task['impl'], targeted_task['pt']
 
     model = allocator.from_pretrained(model)
     return task(model, tokenizer, **kwargs)
```
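Finally, a hedged end-to-end sketch of the factory path as pinned to PyTorch by this hunk; the checkpoint name and the import path are illustrative assumptions, not part of the commit:

```python
from transformers import AutoTokenizer
from transformers.pipelines import pipeline  # the module changed in this commit

# Placeholder checkpoint; any SQuAD-finetuned model id would do.
name = 'bert-large-cased-whole-word-masking-finetuned-squad'
tokenizer = AutoTokenizer.from_pretrained(name)

# pipeline() resolves the 'pt' allocator, calls from_pretrained on the
# model string, and returns a QuestionAnsweringPipeline(model, tokenizer).
nlp = pipeline('question-answering', name, tokenizer)

answers = nlp(data=[{'question': 'Who authored this commit?',
                     'context': 'Morgan Funtowicz authored the commit.'}])
```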