chenpangpang / transformers
Commit 02110485 authored Dec 06, 2019 by Morgan Funtowicz

    Added batching, topk, chars index and scores.

Parent: e1d89cb2

Showing 1 changed file with 86 additions and 24 deletions.
transformers/pipelines.py  +86 -24
@@ -16,7 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 from abc import ABC, abstractmethod
-from typing import Union, Optional, Tuple
+from typing import Union, Optional, Tuple, List, Dict
 import numpy as np
@@ -24,7 +24,8 @@ from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedToken
 if is_tf_available():
     from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
 else:
-    from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
+    if is_torch_available():
+        from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
@@ -94,30 +95,71 @@ class TextClassificationPipeline(Pipeline):
 class QuestionAnsweringPipeline(Pipeline):
-    """
-    Question Answering pipeling involving Tokenization and Inference.
-    TODO:
-     - top-k answers
-     - return start/end chars
-     - return score
-    """
     def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]):
         super().__init__(model, tokenizer)

+    @staticmethod
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]:
+        is_list = isinstance(question, list)
+
+        if is_list:
+            return [{'question': q, 'context': c} for q, c in zip(question, context)]
+        else:
+            return {'question': question, 'context': context}
+
     @classmethod
     def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
         pass

-    def __call__(self, texts, **kwargs):
+    def __call__(self, *texts, **kwargs):
+        # Set defaults values
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('topk', 1)
+
+        if kwargs['topk'] < 1:
+            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+
+        if kwargs['max_answer_len'] < 1:
+            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+
+        # Tabular input
+        if 'question' in kwargs and 'context' in kwargs:
+            texts = QuestionAnsweringPipeline.create_sample(kwargs['questions'], kwargs['contexts'])
+        elif 'data' in kwargs:
+            texts = kwargs['data']
         # Generic compatibility with sklearn and Keras
-        if 'X' in kwargs and not texts:
+        elif 'X' in kwargs and not texts:
             texts = kwargs.pop('X')
+        else:
+            (texts,) = texts

-        if not isinstance(texts, (tuple, list)):
-            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.')
+        if not isinstance(texts, (dict, list)):
+            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of dict.')

         if not isinstance(texts, list):
             texts = [texts]

+        # Map to tuple (question, context)
+        texts = [(text['question'], text['context']) for text in texts]
+
         inputs = self.tokenizer.batch_encode_plus(
-            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            # texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            texts, add_special_tokens=True, return_tensors='pt'
         )

         # Remove special_tokens_mask to avoid KeyError
         _ = inputs.pop('special_tokens_mask')

-        if is_tf_available():
+        # if is_tf_available():
+        if False:
             # TODO trace model
             start, end = self.model(inputs)
         else:
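For reference, a minimal standalone sketch of the batching behaviour this hunk introduces. The function below mirrors the create_sample static method added above, lifted out of the class so it runs on its own; the sample questions and contexts are made up for illustration.

    def create_sample(question, context):
        # Pair each question with its context; mirrors
        # QuestionAnsweringPipeline.create_sample from the hunk above.
        if isinstance(question, list):
            return [{'question': q, 'context': c} for q, c in zip(question, context)]
        return {'question': question, 'context': context}

    print(create_sample('Who wrote it?', 'The patch was written by Morgan.'))
    # {'question': 'Who wrote it?', 'context': 'The patch was written by Morgan.'}
    print(create_sample(['Q1', 'Q2'], ['C1', 'C2']))
    # [{'question': 'Q1', 'context': 'C1'}, {'question': 'Q2', 'context': 'C2'}]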
@@ -133,18 +175,19 @@ class QuestionAnsweringPipeline(Pipeline):
             start_, end_ = start[i, context_idx], end[i, context_idx]

             # Normalize logits and spans to retrieve the answer
-            start_, end_ = self.decode(start_, end_)
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])

             # Convert the answer (tokens) back to the original text
-            answers += [{
-                'start': start_,
-                'end': end_,
-                'answer': self.span_to_answer(texts[i][1], start_, end_)
-            }]
+            answers += [
+                {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)}
+                for s, e, score in zip(starts, ends, scores)
+            ]

         return answers

-    def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple:
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         # Ensure we have batch axis
         if start.ndim == 1:
             start = start[None]
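Note that decode now receives probabilities rather than raw logits: the two np.exp lines above are a plain softmax over each logit vector. A toy check of that normalization, with made-up logits:

    import numpy as np

    # Softmax over raw start logits, exactly as in the hunk above.
    start_ = np.array([2.0, 1.0, 0.1])
    start_ = np.exp(start_) / np.sum(np.exp(start_))
    print(start_, start_.sum())  # non-negative scores summing to 1.0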
@@ -155,22 +198,39 @@ class QuestionAnsweringPipeline(Pipeline):
         # Compute the score of each tuple(start, end) to be the real answer
         outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

-        # Remove candidate with end < start and end - start > 15
-        candidates = np.tril(np.triu(outer), 15)
+        # Remove candidate with end < start and end - start > max_answer_len
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)

-        start = np.max(candidates, axis=2).argmax(-1)
-        end = np.max(candidates, axis=1).argmax(-1)
+        # start = np.max(candidates, axis=2).argmax(-1)
+        # end = np.max(candidates, axis=1).argmax(-1)
+        scores_flat = candidates.flatten()
+        if topk == 1:
+            idx_sort = [np.argmax(scores_flat)]
+        elif len(scores_flat) < topk:
+            idx_sort = np.argsort(-scores_flat)
+        else:
+            idx = np.argpartition(-scores_flat, topk)[0:topk]
+            idx_sort = idx[np.argsort(-scores_flat[idx])]

-        return start, end
+        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+        return start, end, candidates[0, start, end]

     def span_to_answer(self, text: str, start: int, end: int):
-        words, token_idx = [], 0
+        words = []
+        token_idx = char_start_idx = char_end_idx = chars_idx = 0

         for i, word in enumerate(text.split(" ")):
             token = self.tokenizer.tokenize(word)

             # Append words if they are in the span
             if start <= token_idx <= end:
+                if token_idx == start:
+                    char_start_idx = chars_idx
+
+                if token_idx == end:
+                    char_end_idx = chars_idx + len(word)
+
                 words += [word]

             # Stop if we went over the end of the answer
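A self-contained sketch of the new top-k span selection: the decode function below reproduces the logic of the method as changed above, fed with hypothetical toy distributions (assumed values, not real model output). The argpartition path avoids a full sort when only topk candidates are needed.

    import numpy as np

    def decode(start, end, topk, max_answer_len):
        # Ensure we have a batch axis.
        if start.ndim == 1:
            start = start[None]
        if end.ndim == 1:
            end = end[None]

        # Score of each (start, end) pair = outer product of the two distributions.
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Keep only spans with start <= end and length <= max_answer_len.
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Pick the topk best flat indices, sorted by descending score.
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
        return starts, ends, candidates[0, starts, ends]

    # Toy start/end probabilities over 5 token positions.
    start_probs = np.array([0.1, 0.6, 0.1, 0.1, 0.1])
    end_probs = np.array([0.1, 0.1, 0.5, 0.2, 0.1])
    print(decode(start_probs, end_probs, topk=2, max_answer_len=3))
    # Best span is tokens 1..2 (score 0.30), second best 1..3 (score 0.12).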
@@ -179,9 +239,10 @@ class QuestionAnsweringPipeline(Pipeline):
             # Append the subtokenization length to the running index
             token_idx += len(token)
+            chars_idx += len(word) + 1

         # Join text with spaces
-        return ' '.join(words)
+        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}

 # Register all the supported task here
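span_to_answer now returns character offsets alongside the answer text. Below is a standalone sketch of that bookkeeping, substituting a hypothetical one-token-per-word "tokenizer" for the real subword tokenizer so the example runs without a model:

    def span_to_answer(text, start, end):
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for word in text.split(" "):
            token = [word]  # stand-in for self.tokenizer.tokenize(word)

            # Record char offsets when entering/leaving the answer span.
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx
                if token_idx == end:
                    char_end_idx = chars_idx + len(word)
                words += [word]

            token_idx += len(token)
            chars_idx += len(word) + 1  # +1 for the space separator

        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}

    text = "the quick brown fox"
    print(span_to_answer(text, 1, 2))  # {'answer': 'quick brown', 'start': 4, 'end': 15}
    print(text[4:15])                  # 'quick brown' -- the offsets slice the original text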
@@ -193,7 +254,7 @@ SUPPORTED_TASKS = {
     },
     'question-answering': {
         'impl': QuestionAnsweringPipeline,
-        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        # 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
         'pt': AutoModelForQuestionAnswering if is_torch_available() else None
     }
 }
@@ -216,7 +277,8 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenize
         raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

     targeted_task = SUPPORTED_TASKS[task]
-    task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    # task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    task, allocator = targeted_task['impl'], targeted_task['pt']

     model = allocator.from_pretrained(model)
     return task(model, tokenizer, **kwargs)
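Putting the pieces together, a hedged usage sketch against this revision of pipelines.py. The checkpoint name is an assumption (any SQuAD-finetuned model would do), and the input goes through the new dict path of __call__:

    from transformers import AutoTokenizer

    model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'  # assumed checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa = pipeline('question-answering', model_name, tokenizer)

    # Single sample as a dict; topk controls how many answers come back.
    answers = qa({'question': 'Who authored the commit?',
                  'context': 'The commit was authored by Morgan Funtowicz in December 2019.'},
                 topk=2)
    # Each answer dict now carries 'score', 'answer', and char 'start'/'end' offsets.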