chenpangpang / transformers / Commits

Commit 0558c9cb
Authored Dec 10, 2019 by thomwolf

    Merge branch 'master' into t5

Parents: 608a8f5b, e57d00ee

Changes: 168. Showing 20 changed files with 1837 additions and 24 deletions (+1837 −24).
templates/adding_a_new_model/tests/modeling_xxx_test.py           +8    −4
templates/adding_a_new_model/tokenization_xxx.py                  +1    −1
transformers-cli                                                  +23   −0
transformers/__init__.py                                          +38   −11
transformers/commands/__init__.py                                 +12   −0
transformers/commands/user.py                                     +165  −0
transformers/configuration_albert.py                              +100  −0
transformers/configuration_auto.py                                +16   −3
transformers/configuration_camembert.py                           +33   −0
transformers/configuration_distilbert.py                          +3    −1
transformers/configuration_gpt2.py                                +1    −0
transformers/configuration_roberta.py                             +2    −0
transformers/configuration_utils.py                               +6    −1
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py  +67   −0
transformers/convert_pytorch_checkpoint_to_tf2.py                 +5    −0
transformers/data/__init__.py                                     +4    −2
transformers/data/metrics/__init__.py                             +8    −0
transformers/data/metrics/squad_metrics.py                        +758  −0
transformers/data/processors/__init__.py                          +2    −1
transformers/data/processors/squad.py                             +585  −0
templates/adding_a_new_model/tests/modeling_xxx_test.py

@@ -18,12 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 from transformers import is_torch_available
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 if is_torch_available():
     from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                               XxxForQuestionAnswering, XxxForSequenceClassification,
                               XxxForTokenClassification, XxxForMultipleChoice)
     from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
+@require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
     all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
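Note: the `require_torch` and `slow` decorators imported from `.utils` are not part of this diff. A minimal sketch of the pattern they replace the pytest markers with, assuming unittest-style skips and a RUN_SLOW environment flag (names and details here are illustrative, not copied from `.utils`):

import os
import unittest

from transformers import is_torch_available

def require_torch(test_case):
    # Skip the decorated test class/function when PyTorch is absent,
    # replacing the old module-level `pytestmark = pytest.mark.skip(...)`.
    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)

def slow(test_case):
    # Skip unless slow tests are explicitly enabled, replacing `@pytest.mark.slow`.
    return unittest.skipUnless(os.getenv("RUN_SLOW", "0") == "1", "test is slow")(test_case)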
templates/adding_a_new_model/tokenization_xxx.py

@@ -172,7 +172,7 @@ class XxxTokenizer(PreTrainedTokenizer):
                 special tokens for the model
         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
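The docstring fix above inverts the stated convention: 1 now marks a special token and 0 a sequence token, matching what the mask methods return. A quick sketch with a real tokenizer (assumes transformers 2.2.x and network access to fetch `bert-base-uncased`):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tokenizer.encode("hello world", add_special_tokens=True)   # [CLS] hello world [SEP]
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
print(mask)   # [1, 0, 0, 1] -- [CLS] and [SEP] are flagged with 1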
transformers-cli (new file, mode 100644)

#!/usr/bin/env python
from argparse import ArgumentParser

from transformers.commands.user import UserCommands

if __name__ == '__main__':
    parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
    commands_parser = parser.add_subparsers(help='transformers-cli command helpers')

    # Register commands
    UserCommands.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

    if not hasattr(args, 'func'):
        parser.print_help()
        exit(1)

    # Run
    service = args.func(args)
    service.run()
transformers/__init__.py

-__version__ = "2.1.1"
+__version__ = "2.2.1"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -25,10 +25,13 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH
 from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
                    glue_output_modes, glue_convert_examples_to_features,
-                   glue_processors, glue_tasks_num_labels)
+                   glue_processors, glue_tasks_num_labels,
+                   xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
+                   squad_convert_examples_to_features, SquadFeatures,
+                   SquadExample, SquadV1Processor, SquadV2Processor)
 if is_sklearn_available():
-    from .data import glue_compute_metrics
+    from .data import glue_compute_metrics, xnli_compute_metrics
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
@@ -42,6 +45,8 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_albert import AlbertTokenizer
+from .tokenization_camembert import CamembertTokenizer
 from .tokenization_t5 import T5Tokenizer
 # Configurations
@@ -56,6 +61,8 @@ from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MA
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
@@ -73,7 +80,8 @@ if is_torch_available():
                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
+                                      AdaptiveEmbedding,
                                       load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                                 GPT2LMHeadModel, GPT2DoubleHeadsModel,
                                 load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -81,9 +89,10 @@ if is_torch_available():
                                 CTRLLMHeadModel,
                                 CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
-                                 XLNetForSequenceClassification, XLNetForMultipleChoice,
-                                 XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
+                                 XLNetForSequenceClassification, XLNetForTokenClassification,
+                                 XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
+                                 XLNetForQuestionAnswering,
                                  load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_xlm import (XLMPreTrainedModel, XLMModel,
                                XLMWithLMHeadModel, XLMForSequenceClassification,
                                XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
@@ -92,22 +101,31 @@ if is_torch_available():
                                    RobertaForSequenceClassification, RobertaForMultipleChoice,
                                    RobertaForTokenClassification,
                                    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
+    from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
                                       DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+                                      DistilBertForTokenClassification,
                                       DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
+                                     CamembertForSequenceClassification, CamembertForMultipleChoice,
+                                     CamembertForTokenClassification,
+                                     CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
     from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
                               load_tf_weights_in_t5,
                               T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM,
+                                  AlbertForSequenceClassification, AlbertForQuestionAnswering,
+                                  load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     # Optimization
-    from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
-                               WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+    from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
+                               get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
 # TensorFlow
 if is_tf_available():
-    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
+    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
     from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
                                    TFAutoModelWithLMHead)
@@ -133,6 +151,7 @@ if is_tf_available():
     from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
                                     TFXLNetModel, TFXLNetLMHeadModel,
                                     TFXLNetForSequenceClassification,
+                                    TFXLNetForTokenClassification,
                                     TFXLNetForQuestionAnsweringSimple,
                                     TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -151,6 +170,7 @@ if is_tf_available():
     from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                          TFDistilBertModel, TFDistilBertForMaskedLM,
                                          TFDistilBertForSequenceClassification,
+                                         TFDistilBertForTokenClassification,
                                          TFDistilBertForQuestionAnswering,
                                          TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -158,9 +178,16 @@ if is_tf_available():
                                    TFCTRLLMHeadModel,
                                    TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
+                                     TFAlbertForSequenceClassification,
+                                     TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
                                  TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    # Optimization
+    from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
 # TF 2.0 <=> PyTorch conversion utilities
 from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
                                         load_pytorch_checkpoint_in_tf2_model,
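The optimization hunk above replaces the schedule classes (WarmupLinearSchedule and friends) with factory functions. A minimal sketch of the new API (the stand-in model and step counts are illustrative):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)   # stand-in model for illustration
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

for step in range(1000):
    loss = model(torch.randn(4, 10)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()              # one scheduler step per optimizer step
    optimizer.zero_grad()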
transformers/commands/__init__.py (new file, mode 100644)

from abc import ABC, abstractmethod
from argparse import ArgumentParser


class BaseTransformersCLICommand(ABC):
    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        raise NotImplementedError()
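How a concrete command plugs into this interface, sketched with a hypothetical `HelloCommand` (the real registration pattern is the `UserCommands` class in the next file):

from argparse import ArgumentParser

from transformers.commands import BaseTransformersCLICommand

class HelloCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        # Attach a subparser and bind a factory so transformers-cli can
        # instantiate the command from the parsed args.
        hello_parser = parser.add_parser('hello')
        hello_parser.add_argument('--name', type=str, default='world')
        hello_parser.set_defaults(func=lambda args: HelloCommand(args))

    def __init__(self, args):
        self.args = args

    def run(self):
        print('Hello, {}!'.format(self.args.name))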
transformers/commands/user.py (new file, mode 100644)

from argparse import ArgumentParser
from getpass import getpass
import os

from transformers.commands import BaseTransformersCLICommand
from transformers.hf_api import HfApi, HfFolder, HTTPError


class UserCommands(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        login_parser = parser.add_parser('login')
        login_parser.set_defaults(func=lambda args: LoginCommand(args))
        whoami_parser = parser.add_parser('whoami')
        whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
        logout_parser = parser.add_parser('logout')
        logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
        list_parser = parser.add_parser('ls')
        list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
        # upload
        upload_parser = parser.add_parser('upload')
        upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
        upload_parser.set_defaults(func=lambda args: UploadCommand(args))


class ANSI:
    """
    Helper for en.wikipedia.org/wiki/ANSI_escape_code
    """
    _bold = u"\u001b[1m"
    _reset = u"\u001b[0m"

    @classmethod
    def bold(cls, s):
        return "{}{}{}".format(cls._bold, s, cls._reset)


class BaseUserCommand:
    def __init__(self, args):
        self.args = args
        self._api = HfApi()


class LoginCommand(BaseUserCommand):
    def run(self):
        print("""
        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
        """)
        username = input("Username: ")
        password = getpass()
        try:
            token = self._api.login(username, password)
        except HTTPError as e:
            # probably invalid credentials, display error message.
            print(e)
            exit(1)
        HfFolder.save_token(token)
        print("Login successful")
        print("Your token:", token, "\n")
        print("Your token has been saved to", HfFolder.path_token)


class WhoamiCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        try:
            user = self._api.whoami(token)
            print(user)
        except HTTPError as e:
            print(e)


class LogoutCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        HfFolder.delete_token()
        self._api.logout(token)
        print("Successfully logged out.")


class ListObjsCommand(BaseUserCommand):
    def tabulate(self, rows, headers):
        # type: (List[List[Union[str, int]]], List[str]) -> str
        """
        Inspired by:
        stackoverflow.com/a/8356620/593036
        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
        """
        col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
        row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
        lines = []
        lines.append(row_format.format(*headers))
        lines.append(row_format.format(*["-" * w for w in col_widths]))
        for row in rows:
            lines.append(row_format.format(*row))
        return "\n".join(lines)

    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            objs = self._api.list_objs(token)
        except HTTPError as e:
            print(e)
            exit(1)
        if len(objs) == 0:
            print("No shared file yet")
            exit()
        rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs]
        print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]))


class UploadCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        filepath = os.path.join(os.getcwd(), self.args.file)
        filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
        print(
            "About to upload file {} to S3 under filename {}".format(
                ANSI.bold(filepath), ANSI.bold(filename)
            )
        )
        choice = input("Proceed? [Y/n] ").lower()
        if not (choice == "" or choice == "y" or choice == "yes"):
            print("Abort")
            exit()
        print(ANSI.bold("Uploading... This might take a while if file is large"))
        access_url = self._api.presign_and_upload(
            token=token, filename=filename, filepath=filepath
        )
        print("Your file now lives at:")
        print(access_url)
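The `tabulate` helper above is plain `str.format` padding; a quick sketch of what it prints, reusing the same logic on made-up rows (these are not real S3 objects):

rows = [["weights.bin", "2019-12-10", "abc123", 437985064],
        ["config.json", "2019-12-09", "def456", 512]]
headers = ["Filename", "LastModified", "ETag", "Size"]
col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
print(row_format.format(*headers))
print(row_format.format(*["-" * w for w in col_widths]))
for row in rows:
    print(row_format.format(*row))
# Filename    LastModified ETag   Size
# ----------- ------------ ------ ---------
# weights.bin 2019-12-10   abc123 437985064
# config.json 2019-12-09   def456       512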
transformers/configuration_albert.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ALBERT model configuration """

from .configuration_utils import PretrainedConfig

ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
}


class AlbertConfig(PretrainedConfig):
    """Configuration for `AlbertModel`.

    The default settings match the configuration of model `albert_xxlarge`.
    """

    pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30000,
                 embedding_size=128,
                 hidden_size=4096,
                 num_hidden_layers=12,
                 num_hidden_groups=1,
                 num_attention_heads=64,
                 intermediate_size=16384,
                 inner_group_num=1,
                 hidden_act="gelu_new",
                 hidden_dropout_prob=0,
                 attention_probs_dropout_prob=0,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        """Constructs AlbertConfig.

        Args:
            vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
            embedding_size: size of voc embeddings.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_hidden_groups: Number of group for the hidden layers, parameters in
                the same group are shared.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            inner_group_num: int, number of inner repetition of attention and ffn.
            down_scale_factor: float, the scale to apply
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `AlbertModel`.
            initializer_range: The stdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        super(AlbertConfig, self).__init__(**kwargs)

        self.vocab_size = vocab_size_or_config_json_file
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
\ No newline at end of file
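A minimal sketch of constructing this config directly. The defaults above (hidden_size=4096, num_attention_heads=64, intermediate_size=16384) match `albert_xxlarge`, so a base-sized variant overrides them; the values here are illustrative:

from transformers import AlbertConfig

config = AlbertConfig(hidden_size=768, num_hidden_layers=12,
                      num_attention_heads=12, intermediate_size=3072)
print(config.embedding_size)   # 128 -- the factorized embedding stays smaller than hidden_size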
transformers/configuration_auto.py

@@ -27,6 +27,8 @@ from .configuration_xlm import XLMConfig
 from .configuration_roberta import RobertaConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_ctrl import CTRLConfig
+from .configuration_camembert import CamembertConfig
+from .configuration_albert import AlbertConfig
 from .configuration_t5 import T5Config

 logger = logging.getLogger(__name__)
@@ -44,13 +46,15 @@ class AutoConfig(object):
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `albert`: AlbertConfig (ALBERT model)
+            - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `ctrl` : CTRLConfig (CTRL model)
         This class cannot be instantiated using `__init__()` (throw an error).
         """
@@ -67,13 +71,15 @@ class AutoConfig(object):
         in the `pretrained_model_name_or_path` string (in the following order):
             - contains `t5`: T5Config (T5 model)
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `albert`: AlbertConfig (ALBERT model)
+            - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `ctrl` : CTRLConfig (CTRL model)
         Params:
             pretrained_model_name_or_path: either:
@@ -94,6 +100,9 @@ class AutoConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
@@ -120,6 +129,10 @@ class AutoConfig(object):
             return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -138,4 +151,4 @@ class AutoConfig(object):
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
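The dispatch above is plain substring matching in source order, so more specific names must be tested first: 'distilbert', 'albert', and 'camembert' are all checked before 'bert' because each of those names contains 'bert'. A sketch of the effect (assumes network access to fetch the config):

from transformers import AutoConfig

config = AutoConfig.from_pretrained('albert-base-v2')
# -> AlbertConfig: the 'albert' branch matches first, even though
#    'albert-base-v2' also contains the substring 'bert'.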
transformers/configuration_camembert.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CamemBERT configuration """

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging

from .configuration_roberta import RobertaConfig

logger = logging.getLogger(__name__)

CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
}


class CamembertConfig(RobertaConfig):
    pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
transformers/configuration_distilbert.py

@@ -27,7 +27,9 @@ logger = logging.getLogger(__name__)
 DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
+    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
+    'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
+    'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
 }
transformers/configuration_gpt2.py

@@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                       "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
                                       "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+                                      "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
                                       "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}

 class GPT2Config(PretrainedConfig):
transformers/configuration_roberta.py

@@ -29,6 +29,8 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
     'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
+    'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
+    'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
 }
transformers/configuration_utils.py

@@ -94,6 +94,9 @@ class PretrainedConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
@@ -120,6 +123,7 @@ class PretrainedConfig(object):
         """
         cache_dir = kwargs.pop('cache_dir', None)
         force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
         proxies = kwargs.pop('proxies', None)
         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
@@ -131,7 +135,8 @@ class PretrainedConfig(object):
             config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
+                                               proxies=proxies, resume_download=resume_download)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
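A minimal sketch of the new kwarg end to end (assumes network access): an interrupted config download is resumed rather than restarted.

from transformers import BertConfig

# `resume_download` is popped in from_pretrained and forwarded to cached_path,
# which keeps a partially downloaded file and continues where it stopped.
config = BertConfig.from_pretrained('bert-base-uncased', resume_download=True)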
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ALBERT checkpoint."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import torch

from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert

import logging
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the TensorFlow checkpoint path.")
    parser.add_argument("--albert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained ALBERT model.\n"
                             "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.albert_config_file,
                                     args.pytorch_dump_path)
\ No newline at end of file
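The script is normally run from the command line with the three required flags; it can also be driven from Python (the checkpoint and output paths below are illustrative):

from transformers.convert_albert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

convert_tf_checkpoint_to_pytorch(tf_checkpoint_path='./albert_base/model.ckpt-best',
                                 albert_config_file='./albert_base/albert_config.json',
                                 pytorch_dump_path='./albert_base/pytorch_model.bin')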
transformers/convert_pytorch_checkpoint_to_tf2.py

@@ -34,6 +34,7 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
                           RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                          AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)

 if is_torch_available():
@@ -48,6 +49,7 @@ if is_torch_available():
                               RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
                               DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+                              AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -59,6 +61,7 @@ else:
      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+     AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
      T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
        None, None, None, None, None, None,
        None, None, None, None, None, None,
@@ -69,6 +72,7 @@ else:
        None, None, None, None, None, None,
        None, None, None, None, None, None,
-       None, None)
+       None, None,
+       None, None)
@@ -90,6 +94,7 @@ MODEL_CLASSES = {
     'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }
transformers/data/__init__.py

-from .processors import InputExample, InputFeatures, DataProcessor
+from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
+from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
+from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels

 from .metrics import is_sklearn_available
 if is_sklearn_available():
-    from .metrics import glue_compute_metrics
+    from .metrics import glue_compute_metrics, xnli_compute_metrics
transformers/data/metrics/__init__.py

@@ -81,3 +81,11 @@ if _has_sklearn:
             return {"acc": simple_accuracy(preds, labels)}
         else:
             raise KeyError(task_name)
+
+    def xnli_compute_metrics(task_name, preds, labels):
+        assert len(preds) == len(labels)
+        if task_name == "xnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        else:
+            raise KeyError(task_name)
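`xnli_compute_metrics` reuses `simple_accuracy`, defined earlier in this module; a quick worked example with toy arrays:

import numpy as np

preds = np.array([0, 1, 2, 1])
labels = np.array([0, 1, 1, 1])
# simple_accuracy is (preds == labels).mean() -> 3 correct of 4 = 0.75, so
# xnli_compute_metrics("xnli", preds, labels) returns {"acc": 0.75}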
examples/utils_squad
.py
→
transformers/data/metrics/squad_metrics
.py
View file @
0558c9cb
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Load SQuAD dataset. """
from
__future__
import
absolute_import
,
division
,
print_function
import
json
import
json
import
logging
import
logging
...
@@ -24,480 +14,371 @@ import math
...
@@ -24,480 +14,371 @@ import math
import
collections
import
collections
from
io
import
open
from
io
import
open
from
tqdm
import
tqdm
from
tqdm
import
tqdm
import
string
import
re
from
transformers.tokenization_bert
import
BasicTokenizer
,
whitespace_tokenize
from
transformers.tokenization_bert
import
BasicTokenizer
,
whitespace_tokenize
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
from
utils_squad_evaluate
import
find_all_best_thresh_v2
,
make_qid_to_has_ans
,
get_raw_scores
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def get_raw_scores(examples, preds):
    """Computes the exact and f1 scores from the examples and the model predictions."""
    exact_scores = {}
    f1_scores = {}
    for example in examples:
        qas_id = example.qas_id
        gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
        if not gold_answers:
            # For unanswerable questions, the only correct answer is the empty string
            gold_answers = ['']
        if qas_id not in preds:
            print('Missing prediction for %s' % qas_id)
            continue
        prediction = preds[qas_id]
        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
    return exact_scores, f1_scores


def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    new_scores = {}
    for qid, s in scores.items():
        pred_na = na_probs[qid] > na_prob_thresh
        if pred_na:
            new_scores[qid] = float(not qid_to_has_ans[qid])
        else:
            new_scores[qid] = s
    return new_scores


def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    if not qid_list:
        total = len(exact_scores)
        return collections.OrderedDict([
            ('exact', 100.0 * sum(exact_scores.values()) / total),
            ('f1', 100.0 * sum(f1_scores.values()) / total),
            ('total', total),
        ])
    else:
        total = len(qid_list)
        return collections.OrderedDict([
            ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
            ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
            ('total', total),
        ])


def merge_eval(main_eval, new_eval, prefix):
    for k in new_eval:
        main_eval['%s_%s' % (prefix, k)] = new_eval[k]


class SquadExample(object):
    """
    A single training/test example for the SQuAD dataset.
    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (self.qas_id)
        s += ", question_text: %s" % (self.question_text)
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        if self.is_impossible:
            s += ", is_impossible: %r" % (self.is_impossible)
        return s
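
As a quick sanity check of the scoring functions above, here is a minimal sketch; the gold/prediction strings are made up:

# Illustrative only -- toy strings, not part of the file above.
gold = "The leader was John Smith"
pred = "John Smith"
assert compute_exact(gold, pred) == 0        # normalized strings differ
# After normalization the gold tokens are ['leader', 'was', 'john', 'smith']
# and the predicted tokens are ['john', 'smith'], so:
#   precision = 2/2, recall = 2/4, F1 = 2 * 1.0 * 0.5 / 1.5 = 0.667
print(round(compute_f1(gold, pred), 3))      # -> 0.667
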
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for i, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        has_ans_cnt += 1
        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 cls_index,
                 p_mask,
                 paragraph_len,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.cls_index = cls_index
        self.p_mask = p_mask
        self.paragraph_len = paragraph_len
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible
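
To see what the threshold search does, here is a tiny worked run of `find_best_thresh_v2` (illustrative values; the qids and probabilities are made up):

# Illustrative only: two questions, one answerable, one not.
preds = {'q1': 'Paris', 'q2': ''}
exact = {'q1': 1, 'q2': 1}                    # raw exact-match scores
na_probs = {'q1': 0.1, 'q2': 0.9}             # model's no-answer probabilities
qid_to_has_ans = {'q1': True, 'q2': False}
# Walking thresholds in order of na_prob: accepting q1's answer raises the
# running score, so the best threshold lands at na_probs['q1'] = 0.1.
best, thresh, has_ans = find_best_thresh_v2(preds, exact, na_probs, qid_to_has_ans)
print(best, thresh, has_ans)                  # -> 100.0 0.1 1.0
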
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError("For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
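
The whitespace loop above builds a character-to-word map as a side effect; a miniature standalone version of the same logic, on a toy context, makes the mapping concrete:

# Illustrative only: same loop as read_squad_examples, on a made-up context.
context = "Oslo is cold."
doc_tokens, char_to_word_offset = [], []
prev_is_whitespace = True
for c in context:
    if c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F:
        prev_is_whitespace = True
    else:
        if prev_is_whitespace:
            doc_tokens.append(c)
        else:
            doc_tokens[-1] += c
        prev_is_whitespace = False
    char_to_word_offset.append(len(doc_tokens) - 1)
print(doc_tokens)                 # -> ['Oslo', 'is', 'cold.']
print(char_to_word_offset[8])     # character 'c' of 'cold.' -> word index 2
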
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training,
                                 cls_token_at_end=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=0, pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 sequence_a_is_doc=False):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
    # cnt_pos, cnt_neg = 0, 0
    # max_N, max_M = 1024, 1024
    # f = np.zeros((max_N, max_M), dtype=np.float32)

    features = []
    for (example_index, example) in enumerate(tqdm(examples)):

        # if example_index % 100 == 0:
        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)

        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training and example.is_impossible:
            tok_start_position = -1
            tok_end_position = -1
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []

            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
            # Original TF implem also keeps the classification token (set to 0) (not sure why...)
            p_mask = []

            # CLS token at the beginning
            if not cls_token_at_end:
                tokens.append(cls_token)
                segment_ids.append(cls_token_segment_id)
                p_mask.append(0)
                cls_index = 0

            # XLNet: P SEP Q SEP CLS
            # Others: CLS Q SEP P SEP
            if not sequence_a_is_doc:
                # Query
                tokens += query_tokens
                segment_ids += [sequence_a_segment_id] * len(query_tokens)
                p_mask += [1] * len(query_tokens)

                # SEP token
                tokens.append(sep_token)
                segment_ids.append(sequence_a_segment_id)
                p_mask.append(1)

            # Paragraph
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                if not sequence_a_is_doc:
                    segment_ids.append(sequence_b_segment_id)
                else:
                    segment_ids.append(sequence_a_segment_id)
                p_mask.append(0)
            paragraph_len = doc_span.length

            if sequence_a_is_doc:
                # SEP token
                tokens.append(sep_token)
                segment_ids.append(sequence_a_segment_id)
                p_mask.append(1)

                tokens += query_tokens
                segment_ids += [sequence_b_segment_id] * len(query_tokens)
                p_mask += [1] * len(query_tokens)

            # SEP token
            tokens.append(sep_token)
            segment_ids.append(sequence_b_segment_id)
            p_mask.append(1)

            # CLS token at the end
            if cls_token_at_end:
                tokens.append(cls_token)
                segment_ids.append(cls_token_segment_id)
                p_mask.append(0)
                cls_index = len(tokens) - 1  # Index of classification token

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(pad_token)
                input_mask.append(0 if mask_padding_with_zero else 1)
                segment_ids.append(pad_token_segment_id)
                p_mask.append(1)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            span_is_impossible = example.is_impossible
            start_position = None
            end_position = None
            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
                if not (tok_start_position >= doc_start and
                        tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
                    end_position = 0
                    span_is_impossible = True
                else:
                    if sequence_a_is_doc:
                        doc_offset = 0
                    else:
                        doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            if is_training and span_is_impossible:
                start_position = cls_index
                end_position = cls_index

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(tokens))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training and span_is_impossible:
                    logger.info("impossible example")
                if is_training and not span_is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" % (answer_text))

            features.append(
                InputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    cls_index=cls_index,
                    p_mask=p_mask,
                    paragraph_len=paragraph_len,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=span_is_impossible))
            unique_id += 1

    return features


def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
        preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
        preds, f1_raw, na_probs, qid_to_has_ans)
    main_eval['best_exact'] = best_exact
    main_eval['best_exact_thresh'] = exact_thresh
    main_eval['best_f1'] = best_f1
    main_eval['best_f1_thresh'] = f1_thresh
    main_eval['has_ans_exact'] = has_ans_exact
    main_eval['has_ans_f1'] = has_ans_f1


def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for _, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]
    return 100.0 * best_score / len(scores), best_thresh


def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    main_eval['best_exact'] = best_exact
    main_eval['best_exact_thresh'] = exact_thresh
    main_eval['best_f1'] = best_f1
    main_eval['best_f1_thresh'] = f1_thresh


def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]

    if no_answer_probs is None:
        no_answer_probs = {k: 0.0 for k in preds}

    exact, f1 = get_raw_scores(examples, preds)

    exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer,
                                             no_answer_probability_threshold)
    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer,
                                          no_answer_probability_threshold)

    evaluation = make_eval_dict(exact_threshold, f1_threshold)

    if has_answer_qids:
        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
        merge_eval(evaluation, has_ans_eval, 'HasAns')

    if no_answer_qids:
        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
        merge_eval(evaluation, no_ans_eval, 'NoAns')

    if no_answer_probs:
        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)

    return evaluation
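
The sliding-window loop above is easiest to see with numbers; a minimal sketch (the token counts are made up):

# Illustrative only: doc spans for a 500-token document,
# max_tokens_for_doc = 317 (384 - 64 query tokens - 3), doc_stride = 128.
import collections
_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
doc_spans, start_offset, n_tokens = [], 0, 500
max_tokens_for_doc, doc_stride = 317, 128
while start_offset < n_tokens:
    length = min(n_tokens - start_offset, max_tokens_for_doc)
    doc_spans.append(_DocSpan(start=start_offset, length=length))
    if start_offset + length == n_tokens:
        break
    start_offset += min(length, doc_stride)
print(doc_spans)
# -> [DocSpan(start=0, length=317), DocSpan(start=128, length=317),
#     DocSpan(start=256, length=244)]
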
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The original whitespace-tokenized answer will be "(1895-1943).". However
    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
    # the exact answer, 1895.
    #
    # However, this is not always possible. Consider the following:
    #
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
    #   Answer: Japan
    #
    # In this case, the annotator chose "Japan" as a character sub-span of
    # the word "Japanese". Since our WordPiece tokenizer does not split
    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
    # in SQuAD, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text


def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list."""
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes


def _compute_softmax(scores):
    """Compute softmax probability over raw logits."""
    if not scores:
        return []

    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs
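
The max-subtraction in `_compute_softmax` is the usual numerical-stability trick; a quick check with made-up logits:

# Illustrative only -- the logits are made up.
logits = [1000.0, 1001.0, 1002.0]
# A naive math.exp(1002.0) overflows; subtracting the max keeps exponents small.
probs = _compute_softmax(logits)
print([round(p, 3) for p in probs])   # -> [0.09, 0.245, 0.665]
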
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])


# `write_predictions` was renamed to `compute_predictions_logits` in this commit;
# its parameters are unchanged.
def compute_predictions_logits(all_examples, all_features, all_results, n_best_size,
                               max_answer_length, do_lower_case, output_prediction_file,
                               output_nbest_file, output_null_log_odds_file, verbose_logging,
                               version_2_with_negative, null_score_diff_threshold):
    """Write final predictions to the json file and log-odds of null if needed."""
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))
...
@@ -626,12 +507,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                    text="",
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))

        # In very rare edge cases we could only have single null prediction.
        # So we just create a nonce prediction in this case to avoid failure.
        if len(nbest) == 1:
            nbest.insert(0,
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
...
@@ -688,18 +569,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
    return all_predictions
# For XLNet (and XLM which uses the same head)
RawResultExtended = collections.namedtuple("RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])


# `write_predictions_extended` was renamed to `compute_predictions_log_probs`; the new
# version drops the `orig_data_file` argument and no longer runs the evaluation itself.
def compute_predictions_log_probs(all_examples, all_features, all_results, n_best_size,
                                  max_answer_length, output_prediction_file,
                                  output_nbest_file, output_null_log_odds_file,
                                  start_n_top, end_n_top, version_2_with_negative,
                                  tokenizer, verbose_logging):
    """ XLNet write prediction logic (more complex than Bert's).
        Write final predictions to the json file and log-odds of null if needed.
    """
...
@@ -708,7 +592,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index",
         "start_log_prob", "end_log_prob"])

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
...
@@ -745,12 +629,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            for i in range(start_n_top):
                for j in range(end_n_top):
                    # previously result.start_top_log_probs[i] / result.end_top_log_probs[j_index]
                    start_log_prob = result.start_logits[i]
                    start_index = result.start_top_index[i]

                    j_index = i * end_n_top + j

                    end_log_prob = result.end_logits[j_index]
                    end_index = result.end_top_index[j_index]

                    # We could hypothetically create invalid predictions, e.g., predict
...
@@ -791,7 +675,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            # XLNet un-tokenizer
            # Let's keep it simple for now and see if we need all this later.
            #
            # tok_start_to_orig_index = feature.tok_start_to_orig_index
            # tok_end_to_orig_index = feature.tok_end_to_orig_index
            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
...
@@ -871,146 +755,4 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
    with open(output_null_log_odds_file, "w") as writer:
        writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions

# The tail of the old `write_predictions_extended`, removed in this commit, went on to
# evaluate the predictions in place:
#
#     with open(orig_data_file, "r", encoding='utf-8') as reader:
#         orig_data = json.load(reader)["data"]
#     qid_to_has_ans = make_qid_to_has_ans(orig_data)
#     has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
#     no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
#     exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
#     out_eval = {}
#     find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw,
#                             scores_diff_json, qid_to_has_ans)
#     return out_eval
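
Putting the pieces together, evaluation is a dictionary-in, dictionary-out affair. A minimal sketch (the example objects and predictions here are toys with just the attributes `squad_evaluate` reads):

# Illustrative only: stand-ins for real SquadExample objects.
from types import SimpleNamespace
examples = [
    SimpleNamespace(qas_id="q1", answers=[{"text": "Paris"}]),
    SimpleNamespace(qas_id="q2", answers=[]),          # unanswerable
]
preds = {"q1": "Paris", "q2": ""}
results = squad_evaluate(examples, preds)
print(results["exact"], results["f1"])                 # -> 100.0 100.0
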
transformers/data/processors/__init__.py
View file @
0558c9cb
from .utils import InputExample, InputFeatures, DataProcessor
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
transformers/data/processors/squad.py
0 → 100644
View file @
0558c9cb
from tqdm import tqdm

import collections
import logging
import os
import json

import numpy as np

from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
from .utils import DataProcessor, InputExample, InputFeatures
from ...file_utils import is_tf_available, is_torch_available

if is_torch_available():
    import torch
    from torch.utils.data import TensorDataset

if is_tf_available():
    import tensorflow as tf

logger = logging.getLogger(__name__)
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    # if len(doc_spans) == 1:
    #     return True
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span["start"] + doc_span["length"] - 1
        if position < doc_span["start"]:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span["start"]
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False
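
A small concrete case for the max-context check above (toy spans; a token at position 4 is covered by two overlapping windows):

# Illustrative only: two overlapping spans over a toy document.
spans = [{"start": 0, "length": 5}, {"start": 3, "length": 5}]
# Position 4 sits at the right edge of span 0 (min context 0) but has
# 1 token of left and 3 of right context in span 1, so span 1 wins.
print(_new_check_is_max_context(spans, 0, 4))   # -> False
print(_new_check_is_max_context(spans, 1, 4))   # -> True
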
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                       doc_stride, max_query_length, is_training,
                                       return_dataset=False):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset

    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Defining helper methods
    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples)):
        if is_training and not example.is_impossible:
            # Get start and end position
            start_position = example.start_position
            end_position = example.end_position

            # If the answer cannot be found in the text, then skip this example.
            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                continue

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1

            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text)

        spans = []

        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False,
                                           max_length=max_query_length)
        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

        span_doc_tokens = all_doc_tokens
        while len(spans) * doc_stride < len(all_doc_tokens):

            encoded_dict = tokenizer.encode_plus(
                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
                max_length=max_seq_length,
                return_overflowing_tokens=True,
                pad_to_max_length=True,
                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
            )

            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
                                max_seq_length - len(truncated_query) - sequence_pair_added_tokens)

            if tokenizer.pad_token_id in encoded_dict['input_ids']:
                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
            else:
                non_padded_ids = encoded_dict['input_ids']

            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

            token_to_orig_map = {}
            for i in range(paragraph_len):
                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

            encoded_dict["paragraph_len"] = paragraph_len
            encoded_dict["tokens"] = tokens
            encoded_dict["token_to_orig_map"] = token_to_orig_map
            encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
            encoded_dict["token_is_max_context"] = {}
            encoded_dict["start"] = len(spans) * doc_stride
            encoded_dict["length"] = paragraph_len

            spans.append(encoded_dict)

            if "overflowing_tokens" not in encoded_dict:
                break
            span_doc_tokens = encoded_dict["overflowing_tokens"]

        for doc_span_index in range(len(spans)):
            for j in range(spans[doc_span_index]["paragraph_len"]):
                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
                spans[doc_span_index]["token_is_max_context"][index] = is_max_context

        for span in spans:
            # Identify the position of the CLS token
            cls_index = span['input_ids'].index(tokenizer.cls_token_id)

            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
            # Original TF implem also keeps the classification token (set to 0) (not sure why...)
            p_mask = np.array(span['token_type_ids'])

            p_mask = np.minimum(p_mask, 1)

            if tokenizer.padding_side == "right":
                # Limit positive values to one
                p_mask = 1 - p_mask

            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1

            # Set the CLS index to '0'
            p_mask[cls_index] = 0

            span_is_impossible = example.is_impossible
            start_position = 0
            end_position = 0
            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = span["start"]
                doc_end = span["start"] + span["length"] - 1
                out_of_span = False

                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                    out_of_span = True

                if out_of_span:
                    start_position = cls_index
                    end_position = cls_index
                    span_is_impossible = True
                else:
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = len(truncated_query) + sequence_added_tokens

                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            features.append(SquadFeatures(
                span['input_ids'],
                span['attention_mask'],
                span['token_type_ids'],
                cls_index,
                p_mask.tolist(),
                example_index=example_index,
                unique_id=unique_id,
                paragraph_len=span['paragraph_len'],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position
            ))

            unique_id += 1

    if return_dataset == 'pt':
        if not is_torch_available():
            raise ImportError("Pytorch must be installed to return a pytorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if not is_training:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)

        return features, dataset

    return features
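
Building on the docstring's example, here is a sketch of the 'pt' path feeding a DataLoader (the paths and the tokenizer choice are assumptions, not fixed by this file):

# Illustrative only: assumes SQuAD dev files under data/squad and a BERT tokenizer.
from transformers import BertTokenizer
from torch.utils.data import DataLoader

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = SquadV2Processor()
examples = processor.get_dev_examples("data/squad")
features, dataset = squad_convert_examples_to_features(
    examples=examples, tokenizer=tokenizer, max_seq_length=384,
    doc_stride=128, max_query_length=64, is_training=False,
    return_dataset='pt')
eval_loader = DataLoader(dataset, batch_size=8)
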
class SquadProcessor(DataProcessor):
    """
    Processor for the SQuAD data set.
    Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and version 2.0 of SQuAD, respectively.
    """
    train_file = None
    dev_file = None

    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
        if not evaluate:
            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
            answers = []
        else:
            answers = [{
                "answer_start": start.numpy(),
                "text": text.numpy().decode('utf-8')
            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]

            answer = None
            answer_start = None

        return SquadExample(
            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
            question_text=tensor_dict['question'].numpy().decode('utf-8'),
            context_text=tensor_dict['context'].numpy().decode('utf-8'),
            answer_text=answer,
            start_position_character=answer_start,
            title=tensor_dict['title'].numpy().decode('utf-8'),
            answers=answers
        )

    def get_examples_from_dataset(self, dataset, evaluate=False):
        """
        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.

        Args:
            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
            evaluate: boolean specifying if in evaluation mode or in training mode

        Returns:
            List of SquadExample

        Examples::

            import tensorflow_datasets as tfds
            dataset = tfds.load("squad")

            training_examples = get_examples_from_dataset(dataset, evaluate=False)
            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
        """
        if evaluate:
            dataset = dataset["validation"]
        else:
            dataset = dataset["train"]

        examples = []
        for tensor_dict in tqdm(dataset):
            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))

        return examples

    def get_train_examples(self, data_dir, filename=None):
        """
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
        if self.train_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r",
                  encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "train")

    def get_dev_examples(self, data_dir, filename=None):
        """
        Returns the evaluation examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
        if self.dev_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r",
                  encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "dev")

    def _create_examples(self, input_data, set_type):
        is_training = set_type == "train"
        examples = []
        for entry in tqdm(input_data):
            title = entry['title']
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position_character = None
                    answer_text = None
                    answers = []

                    if "is_impossible" in qa:
                        is_impossible = qa["is_impossible"]
                    else:
                        is_impossible = False

                    if not is_impossible:
                        if is_training:
                            answer = qa["answers"][0]
                            answer_text = answer['text']
                            start_position_character = answer['answer_start']
                        else:
                            answers = qa["answers"]

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers
                    )
                    examples.append(example)
        return examples
class SquadV1Processor(SquadProcessor):
    train_file = "train-v1.1.json"
    dev_file = "dev-v1.1.json"


class SquadV2Processor(SquadProcessor):
    train_file = "train-v2.0.json"
    dev_file = "dev-v2.0.json"
class SquadExample(object):
    """
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
        is_impossible: False by default, set to True if the example has no possible answer.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 context_text,
                 answer_text,
                 start_position_character,
                 title,
                 answers=[],
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        self.answers = answers

        self.start_position, self.end_position = 0, 0

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        # Split on whitespace so that different tokens may be attributed to their original position.
        for c in self.context_text:
            if _is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens
        self.char_to_word_offset = char_to_word_offset

        # Start and end positions only have a value when an answer span is given.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
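
To make the character-to-word bookkeeping concrete, a small construction (the example text is made up):

# Illustrative only.
ex = SquadExample(
    qas_id="toy-1",
    question_text="Where is the Eiffel Tower?",
    context_text="The Eiffel Tower is in Paris.",
    answer_text="Paris",
    start_position_character=23,
    title="Eiffel Tower")
print(ex.doc_tokens)                         # -> ['The', 'Eiffel', 'Tower', 'is', 'in', 'Paris.']
print(ex.start_position, ex.end_position)    # -> 5 5
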
class SquadFeatures(object):
    """
    Single squad example features to be fed to a model.
    Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
    using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        cls_index: the index of the CLS token.
        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
            Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer.
        example_index: the index of the example
        unique_id: The unique Feature identifier
        paragraph_len: The length of the context
        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
            If a token does not have its maximum context in this feature object, it means that another feature object
            has more information related to that token and should be prioritized over this feature for that token.
        tokens: list of tokens corresponding to the input ids
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
    """

    def __init__(self,
                 input_ids,
                 attention_mask,
                 token_type_ids,
                 cls_index,
                 p_mask,
                 example_index,
                 unique_id,
                 paragraph_len,
                 token_is_max_context,
                 tokens,
                 token_to_orig_map,
                 start_position,
                 end_position):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.cls_index = cls_index
        self.p_mask = p_mask

        self.example_index = example_index
        self.unique_id = unique_id
        self.paragraph_len = paragraph_len
        self.token_is_max_context = token_is_max_context
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map

        self.start_position = start_position
        self.end_position = end_position
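
As a sketch of how the `p_mask` in these features is produced for a right-padding tokenizer, mirroring the loop in `squad_convert_examples_to_features` above (the ids below are toy BERT-style values):

# Illustrative only: 101 = [CLS], 102 = [SEP], 0 = padding in BERT vocabularies.
import numpy as np
token_type_ids = [0, 0, 0, 1, 1, 1, 1]                 # query = segment 0, context = segment 1
input_ids      = [101, 2054, 102, 7592, 2088, 102, 0]
p_mask = 1 - np.minimum(np.array(token_type_ids), 1)   # context tokens become 0 ("can answer")
p_mask[np.where(np.array(input_ids) == 102)[0]] = 1    # [SEP] can never be part of an answer
p_mask[0] = 0                                          # the CLS index stays available
print(p_mask.tolist())                                 # -> [0, 1, 1, 0, 0, 1, 0]
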
class SquadResult(object):
    """
    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
    """

    def __init__(self, unique_id, start_logits, end_logits,
                 start_top_index=None, end_top_index=None, cls_logits=None):
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.unique_id = unique_id

        if start_top_index:
            self.start_top_index = start_top_index
            self.end_top_index = end_top_index
            self.cls_logits = cls_logits
\ No newline at end of file
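
A short sketch of how these results are typically assembled during evaluation (the feature and logits are random stand-ins for real model outputs):

# Illustrative only: pair each feature's unique_id with its model outputs.
import random
toy_features = [SquadFeatures([101, 102], [1, 1], [0, 0], 0, [0, 1],
                              example_index=0, unique_id=1000000000,
                              paragraph_len=1, token_is_max_context={},
                              tokens=["[CLS]", "[SEP]"], token_to_orig_map={},
                              start_position=0, end_position=0)]
results = []
for feature in toy_features:
    start_logits = [random.random() for _ in feature.input_ids]
    end_logits = [random.random() for _ in feature.input_ids]
    results.append(SquadResult(feature.unique_id, start_logits, end_logits))
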