chenpangpang / transformers · Commit 72c36b9e

[WIP] - CLI

Authored Oct 16, 2019 by thomwolf; committed Dec 10, 2019 by Morgan Funtowicz.
Parent: e57d00ee
Showing 10 changed files with 631 additions and 127 deletions (+631 -127).
setup.py                                    +4    -4
transformers-cli                            +3    -2
transformers/__init__.py                    +2    -0
transformers/__main__.py                    +26   -119
transformers/commands/convert.py            +115  -0
transformers/commands/serving.py            +176  -0
transformers/commands/train.py              +121  -0
transformers/data/__init__.py               +1    -1
transformers/data/processors/__init__.py    +1    -1
transformers/data/processors/utils.py       +182  -0
setup.py (+4 -4)
...
@@ -62,15 +62,15 @@ setup(
         'regex',
         'sentencepiece',
         'sacremoses'
     ],
-    extras_require=extras,
-    scripts=['transformers-cli'],
     entry_points={
         'console_scripts': [
             "transformers=transformers.__main__:main",
         ]
     },
+    extras_require=extras,
+    scripts=['transformers-cli'],
     # python_requires='>=3.5.0',
     classifiers=[
         'Intended Audience :: Science/Research',
...
transformers-cli (+3 -2)
 #!/usr/bin/env python
 from argparse import ArgumentParser

+from transformers.commands.serving import ServeCommand
 from transformers.commands.user import UserCommands

 if __name__ == '__main__':
-    parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
+    parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]')
     commands_parser = parser.add_subparsers(help='transformers-cli command helpers')

     # Register commands
+    ServeCommand.register_subcommand(commands_parser)
     UserCommands.register_subcommand(commands_parser)

     # Let's go
...
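Each command class binds a factory to its sub-parser via set_defaults(func=...), so the tail of this script (elided above) only has to parse and dispatch. A minimal sketch of that tail, assuming every factory returns an object with a run() method as BaseTransformersCLICommand implies; it mirrors the commented-out block in transformers/__main__.py below:

# Sketch of the dispatch tail (an assumption; the file is truncated above).
args = parser.parse_args()

if not hasattr(args, 'func'):
    # No subcommand given: show usage and bail out.
    parser.print_help()
    exit(1)

# args.func is the factory registered through set_defaults(func=...);
# it builds the command object, whose run() does the actual work.
service = args.func(args)
service.run()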
transformers/__init__.py (+2 -0)
...
@@ -24,6 +24,8 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH
 from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
+                   SingleSentenceClassificationProcessor,
+                   convert_examples_to_features,
                    glue_output_modes, glue_convert_examples_to_features,
                    glue_processors, glue_tasks_num_labels,
                    xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
...
transformers/__main__.py (+26 -119)
The 119 removed lines were the inline TF-to-PyTorch conversion dispatch (bert, gpt, transfo_xl, gpt2, xlnet and xlm, each with its own argv handling and TensorFlow ImportError messages); that logic moves to transformers/commands/convert.py below. The new version only routes to the new subcommands:

# coding: utf8
def main():
    import sys
    if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]:
        print(
            "First argument to `transformers` command line interface should be one of: \n"
            ">> convert serve train predict")
        return  # nothing more to do without a valid subcommand

    if sys.argv[1] == "convert":
        from transformers.commands import convert
        convert(sys.argv)
    elif sys.argv[1] == "train":
        from transformers.commands import train
        train(sys.argv)
    elif sys.argv[1] == "serve":
        pass
        # from argparse import ArgumentParser
        # from transformers.commands.serving import ServeCommand
        # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
        # commands_parser = parser.add_subparsers(help='transformers-cli command helpers')

        # # Register commands
        # ServeCommand.register_subcommand(commands_parser)

        # # Let's go
        # args = parser.parse_args()
        # if not hasattr(args, 'func'):
        #     parser.print_help()
        #     exit(1)

        # # Run
        # service = args.func(args)
        # service.run()

if __name__ == '__main__':
    main()
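Through the console_scripts entry point in setup.py, main() is what answers the bare `transformers` command. A hypothetical programmatic invocation (argument handling is still WIP in this commit):

# Hypothetical: drive the entry point from Python by faking argv.
import sys
from transformers.__main__ import main

sys.argv = ['transformers', 'serve']   # one of: convert, serve, train, predict
main()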
transformers/commands/convert.py (new file, +115)
from argparse import ArgumentParser, Namespace
from logging import getLogger

from transformers import AutoModel, AutoTokenizer
from transformers.commands import BaseTransformersCLICommand


def convert_command_factory(args: Namespace):
    """
    Factory function used to convert a model TF 1.0 checkpoint into a PyTorch checkpoint.
    :return: ConvertCommand
    """
    return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output,
                          args.config, args.finetuning_task_name)


class ConvertCommand(BaseTransformersCLICommand):

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        train_parser = parser.add_parser('convert',
                                         help="CLI tool to convert a model from original "
                                              "author checkpoints to Transformers PyTorch checkpoints.")
        train_parser.add_argument('--model_type', type=str, required=True,
                                  help='Model\'s type.')
        train_parser.add_argument('--tf_checkpoint', type=str, required=True,
                                  help='TensorFlow checkpoint path or folder.')
        train_parser.add_argument('--pytorch_dump_output', type=str, required=True,
                                  help='Path to the PyTorch saved model output.')
        train_parser.add_argument('--config', type=str, default="",
                                  help='Configuration file path or folder.')
        train_parser.add_argument('--finetuning_task_name', type=str, default=None,
                                  help='Optional fine-tuning task name if the TF model was a finetuned model.')
        train_parser.set_defaults(func=convert_command_factory)

    def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str,
                 config: str, finetuning_task_name: str, *args):
        self._logger = getLogger('transformers-cli/converting')

        self._logger.info('Loading model {}'.format(model_type))
        self._model_type = model_type
        self._tf_checkpoint = tf_checkpoint
        self._pytorch_dump_output = pytorch_dump_output
        self._config = config
        self._finetuning_task_name = finetuning_task_name

    def run(self):
        if self._model_type == "bert":
            try:
                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
            except ImportError:
                msg = "transformers can only be used from the commandline to convert TensorFlow models to PyTorch; " \
                      "this requires TensorFlow to be installed. Please see " \
                      "https://www.tensorflow.org/install/ for installation instructions."
                raise ImportError(msg)

            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)

        elif self._model_type == "gpt":
            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)

        elif self._model_type == "transfo_xl":
            try:
                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
            except ImportError:
                msg = "transformers can only be used from the commandline to convert TensorFlow models to PyTorch; " \
                      "this requires TensorFlow to be installed. Please see " \
                      "https://www.tensorflow.org/install/ for installation instructions."
                raise ImportError(msg)

            if 'ckpt' in self._tf_checkpoint.lower():
                TF_CHECKPOINT = self._tf_checkpoint
                TF_DATASET_FILE = ""
            else:
                TF_DATASET_FILE = self._tf_checkpoint
                TF_CHECKPOINT = ""
            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, self._config,
                                                     self._pytorch_dump_output, TF_DATASET_FILE)

        elif self._model_type == "gpt2":
            try:
                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
            except ImportError:
                msg = "transformers can only be used from the commandline to convert TensorFlow models to PyTorch; " \
                      "this requires TensorFlow to be installed. Please see " \
                      "https://www.tensorflow.org/install/ for installation instructions."
                raise ImportError(msg)

            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)

        elif self._model_type == "xlnet":
            try:
                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
            except ImportError:
                msg = "transformers can only be used from the commandline to convert TensorFlow models to PyTorch; " \
                      "this requires TensorFlow to be installed. Please see " \
                      "https://www.tensorflow.org/install/ for installation instructions."
                raise ImportError(msg)

            convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint, self._config,
                                                self._pytorch_dump_output, self._finetuning_task_name)

        elif self._model_type == "xlm":
            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)

        else:
            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
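Given the arguments registered above, wiring ConvertCommand into a parser and running a bert conversion would look roughly like this; the checkpoint, config, and output paths are placeholders:

# Sketch: register and invoke the convert subcommand (paths are placeholders).
from argparse import ArgumentParser
from transformers.commands.convert import ConvertCommand

parser = ArgumentParser('transformers-cli')
commands_parser = parser.add_subparsers()
ConvertCommand.register_subcommand(commands_parser)

args = parser.parse_args([
    'convert',
    '--model_type', 'bert',
    '--tf_checkpoint', '/path/to/bert_model.ckpt',
    '--config', '/path/to/bert_config.json',
    '--pytorch_dump_output', '/path/to/pytorch_model.bin',
])
args.func(args).run()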
transformers/commands/serving.py (new file, +176)
from argparse import ArgumentParser, Namespace
from typing import List, Optional, Union, Any

import torch
from fastapi import FastAPI, HTTPException, Body
from logging import getLogger
from pydantic import BaseModel
from uvicorn import run

from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers.commands import BaseTransformersCLICommand


def serve_command_factory(args: Namespace):
    """
    Factory function used to instantiate the serving server from the provided command line arguments.
    :return: ServeCommand
    """
    return ServeCommand(args.host, args.port, args.model, args.graphql)


class ServeResult(BaseModel):
    """
    Base class for serving result
    """
    model: str


class ServeModelInfoResult(ServeResult):
    """
    Expose model information
    """
    infos: dict


class ServeTokenizeResult(ServeResult):
    """
    Tokenize result model
    """
    tokens: List[str]
    tokens_ids: Optional[List[int]]


class ServeDeTokenizeResult(ServeResult):
    """
    DeTokenize result model
    """
    text: str


class ServeForwardResult(ServeResult):
    """
    Forward result model
    """
    tokens: List[str]
    tokens_ids: List[int]
    output: Any
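These response models are plain pydantic schemas, so FastAPI serializes them automatically; a quick sketch of what a /tokenize response carries, assuming pydantic 1.x and the class definitions above:

# Assumes the ServeTokenizeResult class above and the pydantic 1.x API.
result = ServeTokenizeResult(model='bert-base-uncased',
                             tokens=['hello', 'world'],
                             tokens_ids=[7592, 2088])
print(result.json())
# {"model": "bert-base-uncased", "tokens": ["hello", "world"], "tokens_ids": [7592, 2088]}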
class ServeCommand(BaseTransformersCLICommand):

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        serve_parser = parser.add_parser('serve',
                                         help='CLI tool to run inference requests through REST and GraphQL endpoints.')
        serve_parser.add_argument('--host', type=str, default='localhost',
                                  help='Interface the server will listen on.')
        serve_parser.add_argument('--port', type=int, default=8888,
                                  help='Port the serving will listen to.')
        serve_parser.add_argument('--model', type=str, required=True,
                                  help='Model\'s name or path to stored model to infer from.')
        serve_parser.add_argument('--graphql', action='store_true', default=False,
                                  help='Enable GraphQL endpoints.')
        serve_parser.set_defaults(func=serve_command_factory)

    def __init__(self, host: str, port: int, model: str, graphql: bool):
        self._logger = getLogger('transformers-cli/serving')

        self._logger.info('Loading model {}'.format(model))
        self._model_name = model
        self._model = AutoModel.from_pretrained(model)
        self._tokenizer = AutoTokenizer.from_pretrained(model)

        self._logger.info('Serving model over {}:{}'.format(host, port))
        self._host = host
        self._port = port

        self._app = FastAPI()

        # Register routes
        self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET'])
        self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST'])
        self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST'])
        self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST'])

    def run(self):
        run(self._app, host=self._host, port=self._port)

    def model_info(self):
        return ServeModelInfoResult(model=self._model_name, infos=vars(self._model.config))

    def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
        """
        Tokenize the provided input and optionally return the corresponding token ids:
        - **text_input**: String to tokenize
        - **return_ids**: Boolean flag indicating if the tokens have to be converted to their integer mapping.
        """
        try:
            tokens_txt = self._tokenizer.tokenize(text_input)

            if return_ids:
                tokens_ids = self._tokenizer.convert_tokens_to_ids(tokens_txt)
                return ServeTokenizeResult(model=self._model_name, tokens=tokens_txt, tokens_ids=tokens_ids)
            else:
                return ServeTokenizeResult(model=self._model_name, tokens=tokens_txt)

        except Exception as e:
            raise HTTPException(status_code=500, detail={"model": self._model_name, "error": str(e)})

    def detokenize(self, tokens_ids: List[int] = Body(None, embed=True),
                   skip_special_tokens: bool = Body(False, embed=True),
                   cleanup_tokenization_spaces: bool = Body(True, embed=True)):
        """
        Detokenize the provided tokens ids to readable text:
        - **tokens_ids**: List of tokens ids
        - **skip_special_tokens**: Flag indicating whether to skip special tokens when decoding
        - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones.
        """
        try:
            decoded_str = self._tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
            return ServeDeTokenizeResult(model=self._model_name, text=decoded_str)
        except Exception as e:
            raise HTTPException(status_code=500, detail={"model": self._model_name, "error": str(e)})

    def forward(self, inputs: Union[str, List[str], List[int]] = Body(None, embed=True),
                attention_mask: Optional[List[int]] = Body(None, embed=True),
                tokens_type_ids: Optional[List[int]] = Body(None, embed=True)):
        """
        Run the provided inputs through the model:
        - **inputs**: String, list of tokens, or list of token ids to forward through the model
        - **attention_mask**: Optional attention mask
        - **tokens_type_ids**: Optional token type ids
        """
        # Check we don't have an empty string
        if len(inputs) == 0:
            return ServeForwardResult(model=self._model_name, tokens=[], tokens_ids=[], output=[])

        if isinstance(inputs, str):
            inputs_tokens = self._tokenizer.tokenize(inputs)
            inputs_ids = self._tokenizer.convert_tokens_to_ids(inputs_tokens)
        elif isinstance(inputs, List):
            if isinstance(inputs[0], str):
                inputs_tokens = inputs
                inputs_ids = self._tokenizer.convert_tokens_to_ids(inputs_tokens)
            elif isinstance(inputs[0], int):
                inputs_tokens = []
                inputs_ids = inputs
            else:
                error_msg = "inputs should be string, [str] or [int] (got {})".format(type(inputs[0]))
                raise HTTPException(423, detail={"error": error_msg})
        else:
            error_msg = "inputs should be string, [str] or [int] (got {})".format(type(inputs))
            raise HTTPException(423, detail={"error": error_msg})

        try:
            # Forward through the model
            t_input_ids = torch.tensor(inputs_ids).unsqueeze(0)
            output = self._model(t_input_ids, attention_mask, tokens_type_ids)

            return ServeForwardResult(
                model=self._model_name, tokens=inputs_tokens,
                tokens_ids=inputs_ids, output=output[0].tolist()
            )
        except Exception as e:
            raise HTTPException(500, {"error": str(e)})
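Because every parameter above is declared with Body(..., embed=True), FastAPI expects each one as a named field of the JSON request body. A hypothetical client against the default host and port:

# Hypothetical client; assumes the server was started with the defaults
# (localhost:8888) and that the `requests` package is available.
import requests

resp = requests.post('http://localhost:8888/tokenize',
                     json={'text_input': 'Hello world', 'return_ids': True})
print(resp.json())   # {'model': ..., 'tokens': [...], 'tokens_ids': [...]}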
transformers/commands/train.py (new file, +121)
import os
from argparse import ArgumentParser, Namespace
from logging import getLogger

from transformers.commands import BaseTransformersCLICommand
from transformers import (AutoTokenizer, is_tf_available, is_torch_available,
                          SingleSentenceClassificationProcessor,
                          convert_examples_to_features)

if is_tf_available():
    from transformers import TFAutoModelForSequenceClassification as SequenceClassifModel
elif is_torch_available():
    from transformers import AutoModelForSequenceClassification as SequenceClassifModel
else:
    raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")

# TF training parameters
BATCH_SIZE = 32
EVAL_BATCH_SIZE = BATCH_SIZE * 2
USE_XLA = False
USE_AMP = False


def train_command_factory(args: Namespace):
    """
    Factory function used to instantiate the training command from the provided command line arguments.
    :return: TrainCommand
    """
    return TrainCommand(args.model, args.task, args.train_data, args.valid_data, args.valid_data_ratio)


class TrainCommand(BaseTransformersCLICommand):

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.')
        train_parser.add_argument('--train_data', type=str, required=True,
                                  help='Path to train (and optionally evaluation) dataset.')
        train_parser.add_argument('--task', type=str, default='text_classification',
                                  help='Task to train the model on.')
        train_parser.add_argument('--model', type=str, default='bert-base-uncased',
                                  help='Model\'s name or path to stored model.')
        train_parser.add_argument('--valid_data', type=str, default='',
                                  help='Path to validation dataset.')
        train_parser.add_argument('--valid_data_ratio', type=float, default=0.1,
                                  help="If validation dataset is not provided, fraction of train dataset "
                                       "to use as validation dataset.")
        train_parser.set_defaults(func=train_command_factory)

    def __init__(self, model_name: str, task: str, train_data: str, valid_data: str, valid_data_ratio: float):
        self._logger = getLogger('transformers-cli/training')

        self._framework = 'tf' if is_tf_available() else 'torch'

        self._logger.info('Loading model {}'.format(model_name))
        self._model_name = model_name
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)

        if task == 'text_classification':
            self._model = SequenceClassifModel.from_pretrained(model_name)
        elif task == 'token_classification':
            raise NotImplementedError
        elif task == 'question_answering':
            raise NotImplementedError

        dataset = SingleSentenceClassificationProcessor.create_from_csv(train_data)
        num_data_samples = len(dataset)
        if valid_data:
            self._train_dataset = dataset
            self._num_train_samples = num_data_samples
            self._valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(valid_data)
            self._num_valid_samples = len(self._valid_dataset)
        else:
            assert 0.0 < valid_data_ratio < 1.0, "--valid_data_ratio should be between 0.0 and 1.0"
            self._num_valid_samples = int(num_data_samples * valid_data_ratio)
            self._num_train_samples = num_data_samples - self._num_valid_samples
            # The processor supports slicing, so split the single CSV in two.
            self._train_dataset = dataset[:self._num_train_samples]
            self._valid_dataset = dataset[self._num_train_samples:]

    def run(self):
        if self._framework == 'tf':
            return self.run_tf()
        return self.run_torch()

    def run_torch(self):
        raise NotImplementedError

    def run_tf(self):
        import tensorflow as tf

        tf.config.optimizer.set_jit(USE_XLA)
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

        # Prepare dataset as a tf.data.Dataset instance
        train_dataset = convert_examples_to_features(self._train_dataset, self._tokenizer, mode='sequence_classification')
        valid_dataset = convert_examples_to_features(self._valid_dataset, self._tokenizer, mode='sequence_classification')
        train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
        valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

        # Prepare training: compile tf.keras model with optimizer, loss and learning rate schedule
        opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
        if USE_AMP:
            # loss scaling is currently required when using mixed precision
            opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        self._model.compile(optimizer=opt, loss=loss, metrics=[metric])

        # Train and evaluate using tf.keras.Model.fit()
        train_steps = self._num_train_samples // BATCH_SIZE
        valid_steps = self._num_valid_samples // EVAL_BATCH_SIZE
        history = self._model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
                                  validation_data=valid_dataset, validation_steps=valid_steps)

        # Save TF2 model
        os.makedirs('./save/', exist_ok=True)
        self._model.save_pretrained('./save/')
transformers/data/__init__.py (+1 -1)
-from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
+from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor, convert_examples_to_features
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
 from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
...
transformers/data/processors/__init__.py (+1 -1)
-from .utils import InputExample, InputFeatures, DataProcessor
+from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor, convert_examples_to_features
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
 from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
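With the re-exports above chained through transformers/data/__init__.py and the top-level transformers/__init__.py, the new helpers become importable from the package root:

# The new helpers, reachable from the package root after this commit:
from transformers import (SingleSentenceClassificationProcessor,
                          convert_examples_to_features)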
transformers/data/processors/utils.py (+182 -0)
...
@@ -125,3 +125,185 @@ class DataProcessor(object):
             line = list(unicode(cell, 'utf-8') for cell in line)
             lines.append(line)
         return lines

The remainder of the hunk is new code:
class SingleSentenceClassificationProcessor(DataProcessor):
    """ Generic processor for a single sentence classification data set."""
    def __init__(self, labels=None, examples=None):
        self.labels = [] if labels is None else labels
        self.examples = [] if examples is None else examples

    @classmethod
    def create_from_csv(cls, file_name):
        processor = cls()
        processor.add_examples_from_csv(file_name)
        return processor

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            return SingleSentenceClassificationProcessor(labels=self.labels,
                                                         examples=self.examples[idx])
        return self.examples[idx]

    def get_labels(self):
        """Gets the list of labels for this data set."""
        return self.labels

    def add_examples_from_csv(self, file_name):
        lines = self._read_tsv(file_name)
        self.add_examples_from_lines(lines)

    def add_examples_from_lines(self, lines, split_name='', overwrite_labels=False, overwrite_examples=False):
        """Creates examples for the training and dev sets."""
        added_labels = set()
        examples = []
        for (i, line) in enumerate(lines):
            if len(line) > 2:
                guid = "%s-%s" % (split_name, line[0]) if split_name else line[0]
                label = line[1]
                text_a = line[2]
            else:
                guid = "%s-%s" % (split_name, i) if split_name else "%s" % i
                label = line[0]
                text_a = line[1]
            added_labels.add(label)
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))

        # Update examples
        if overwrite_examples:
            self.examples = examples
        else:
            self.examples.extend(examples)

        # Update labels
        if overwrite_labels:
            self.labels = list(added_labels)
        else:
            self.labels = list(set(self.labels).union(added_labels))

        return self.examples
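The slice branch of __getitem__ is what TrainCommand relies on to split one CSV into train and validation sets; a small sketch with in-memory two-column lines (label, text):

# Sketch: build a processor from lines and split it by slicing.
processor = SingleSentenceClassificationProcessor()
processor.add_examples_from_lines([
    ['positive', 'a great movie'],
    ['negative', 'a terrible movie'],
], split_name='train')

print(len(processor))          # 2
train_split = processor[:1]    # slicing returns a new processor
print(train_split.get_labels())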
# logging and TF access used by convert_examples_to_features below
import logging
from ...file_utils import is_tf_available

if is_tf_available():
    import tensorflow as tf

logger = logging.getLogger(__name__)


def convert_examples_to_features(examples, tokenizer,
                                 mode='sequence_classification',
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        # Local import to avoid a circular dependency with .glue
        from .glue import glue_processors, glue_output_modes
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield ({'input_ids': ex.input_ids,
                        'attention_mask': ex.attention_mask,
                        'token_type_ids': ex.token_type_ids},
                       ex.label)

        return tf.data.Dataset.from_generator(gen,
                                              ({'input_ids': tf.int32,
                                                'attention_mask': tf.int32,
                                                'token_type_ids': tf.int32},
                                               tf.int64),
                                              ({'input_ids': tf.TensorShape([None]),
                                                'attention_mask': tf.TensorShape([None]),
                                                'token_type_ids': tf.TensorShape([None])},
                                               tf.TensorShape([])))

    return features
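As a quick check of the helper under the signature above, reusing the processor built in the earlier sketch and passing the label list and output mode directly instead of a GLUE task (the model name is a placeholder):

# Sketch: list-of-InputExamples path; returns InputFeatures, not a tf.data.Dataset.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
features = convert_examples_to_features(processor.examples, tokenizer,
                                        max_length=128,
                                        label_list=['negative', 'positive'],
                                        output_mode='classification')
print(features[0].input_ids[:10])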