chenpangpang / transformers / Commits / dc8e0019

Commit dc8e0019 authored Jun 19, 2019 by thomwolf

updating examples

parent 68ab9599

Showing 5 changed files with 212 additions and 42 deletions (+212 / -42)
README.md                                  +23   -0
examples/bertology.py                      +182  -20
examples/run_classifier.py                 +1    -1
pytorch_pretrained_bert/modeling.py        +0    -21
pytorch_pretrained_bert/tokenization.py    +6    -0
README.md

@@ -1288,6 +1288,29 @@ Training with these hyper-parameters gave us the following results:

  loss = 0.07231863956341798

Here is an example on MNLI:

```bash
python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py \
    --bert_model bert-large-uncased-whole-word-masking \
    --task_name mnli \
    --do_train \
    --do_eval \
    --do_lower_case \
    --data_dir /datadrive/bert_data/glue_data//MNLI/ \
    --max_seq_length 128 \
    --train_batch_size 8 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir ../models/wwm-uncased-finetuned-mnli/ \
    --overwrite_output_dir
```

```bash
***** Eval results *****
  acc = 0.8679706601466992
  eval_loss = 0.4911287787382479
  global_step = 18408
  loss = 0.04755385363816904

***** Eval results *****
  acc = 0.8747965825874695
  eval_loss = 0.45516540421714036
  global_step = 18408
  loss = 0.04755385363816904
```
These results correspond to the `bert-large-uncased-whole-word-masking-finetuned-mnli` model.
#### SQuAD

This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single Tesla V100 16GB.
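For orientation only, SQuAD fine-tuning in this repository goes through `examples/run_squad.py`. The sketch below is an illustrative invocation, not the exact command from this README; the data paths and hyper-parameters are placeholders chosen for the example:

```bash
# Hypothetical example: fine-tune bert-base-uncased on SQuAD 1.1
# (paths and hyper-parameters are placeholders; see the full README for the reference command)
python run_squad.py \
    --bert_model bert-base-uncased \
    --do_train \
    --do_predict \
    --do_lower_case \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --train_batch_size 12 \
    --learning_rate 3e-5 \
    --num_train_epochs 2.0 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir /tmp/debug_squad/
```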
...
examples/bertology.py

```python
#!/usr/bin/env python3
import os
import argparse
import logging

from tqdm import tqdm

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def entropy(p):
    plogp = p * torch.log(p)
    plogp[p == 0] = 0
    return -plogp.sum(dim=-1)


def print_1d_tensor(tensor, prefix=""):
    if tensor.dtype != torch.long:
        logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data))
    else:
        logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data))


def print_2d_tensor(tensor):
    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
    for row in range(len(tensor)):
        print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t")


def compute_heads_importance(args, model, eval_dataloader):
    """ Example on how to use model outputs to compute:
        - head attention entropy (activated by setting output_attentions=True when we created the model)
        - head importance scores according to http://arxiv.org/abs/1905.10650
          (activated by setting keep_multihead_output=True when we created the model)
    """
    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # Do a forward pass
        all_attentions, logits = model(input_ids, segment_ids, input_mask)

        # Update head attention entropy
        for layer, attn in enumerate(all_attentions):
            masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
            attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()

        # Update head importance scores with regards to our loss
        # First backpropagate to populate the gradients
        if output_mode == "classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        elif output_mode == "regression":
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1), label_ids.view(-1))
        loss.backward()
        # Second compute importance scores according to http://arxiv.org/abs/1905.10650
        multihead_outputs = model.bert.get_multihead_outputs()
        for layer, mh_layer_output in enumerate(multihead_outputs):
            dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output])
            head_importance[layer] += dot.abs().sum(-1).sum(0).detach()

        tot_tokens += input_mask.float().detach().sum().data

    # Normalize
    attn_entropy /= tot_tokens
    head_importance /= tot_tokens
    if args.normalize_importance:
        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())

    return attn_entropy, head_importance
```
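To make the two per-head statistics concrete, here is a small self-contained sketch (not part of the commit) that reproduces the entropy and importance computations on hand-made tensors; the shapes and the dummy loss are invented purely for illustration:

```python
import torch

# (a) Attention entropy: a uniform attention row has maximal entropy,
#     a one-hot ("peaked") row has entropy 0. Same function as in the script above.
def entropy(p):
    plogp = p * torch.log(p)
    plogp[p == 0] = 0
    return -plogp.sum(dim=-1)

uniform = torch.full((4,), 0.25)              # attends equally to 4 tokens
peaked = torch.tensor([1.0, 0.0, 0.0, 0.0])   # attends to a single token
print(entropy(uniform))   # ~1.386 = ln(4)
print(entropy(peaked))    # 0.0

# (b) Head importance proxy (http://arxiv.org/abs/1905.10650): |dL/dh . h|
#     summed over positions, here with a fake "head output" h and a dummy scalar loss.
batch, heads, seq_len, dim = 2, 3, 5, 8
h = torch.randn(batch, heads, seq_len, dim, requires_grad=True)
loss = (h ** 2).sum()      # stand-in for the task loss
loss.backward()            # populates h.grad
dot = torch.einsum("bhli,bhli->bhl", [h.grad, h])  # same contraction as in the script
importance_per_head = dot.abs().sum(-1).sum(0)     # shape: (heads,)
print(importance_per_head)
```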
```python
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--task_name", type=str, default='mrpc',
                        help="The name of the task to train.")
    parser.add_argument("--data_dir", type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--data_subset", type=int, default=-1,
                        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir", action='store_true',
                        help="Whether to overwrite data in output directory")
    parser.add_argument("--normalize_importance", action='store_true',
                        help="Whether to normalize importance score between 0 and 1")
    parser.add_argument("--try_pruning", action='store_true',
                        help="Whether to try to prune head until a threshold of accuracy.")
    parser.add_argument("--pruning_threshold", default=0.9, type=float,
                        help="Pruning threshold of accuracy.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    args = parser.parse_args()

    # Setup devices and distributed training
    if args.local_rank == -1 or args.no_cuda:
        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
```

@@ -34,21 +110,107 @@ def run_model():

```python
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend
        torch.distributed.init_process_group(backend='nccl')

    # Setup logging
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed: {}".format(
        args.device, n_gpu, bool(args.local_rank != -1)))

    # Set seeds
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.seed)

    # Prepare GLUE task
    task_name = args.task_name.lower()
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Prepare output directory
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # Load model & tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    # Load a model with all BERTology options on:
    #   output_attentions => will output attention weights
    #   keep_multihead_output => will store gradient of attention head outputs
    #       for head importance computation, see: http://arxiv.org/abs/1905.10650
    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,
                                                          num_labels=num_labels,
                                                          output_attentions=True,
                                                          keep_multihead_output=True)
    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab

    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    model.eval()

    # Prepare dataset for the GLUE task
    eval_examples = processor.get_dev_examples(args.data_dir)
    cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task_name)))
    try:
        eval_features = torch.load(cached_eval_features_file)
    except:
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        if args.local_rank in [-1, 0]:
            logger.info("Saving eval features to cache file %s", cached_eval_features_file)
            torch.save(eval_features, cached_eval_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long if output_mode == "classification" else torch.float)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args.data_subset > 0:
        eval_data = Subset(eval_data, list(range(args.data_subset)))
    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    # Print/save training arguments
    print(args)
    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))

    # To showcase some BERTology methods, we will compute:
    # - the average entropy of each head over the dev set
    # - the importance score of each head over the dev set as explained in http://arxiv.org/abs/1905.10650
    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
    tot_tokens = 0.0

    # Compute head entropy and importance score
    attn_entropy, head_importance = compute_heads_importance(args, model, eval_dataloader)

    # Print/save matrices
    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy)
    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance)

    logger.info("Attention entropies")
    print_2d_tensor(attn_entropy)
    logger.info("Head importance scores")
    print_2d_tensor(head_importance)
    logger.info("Head ranked by importance scores")
    head_ranks = torch.zeros(n_layers * n_heads, dtype=torch.long, device=args.device)
    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel())
    print_2d_tensor(head_ranks.view_as(head_importance))

    # Do pruning if we want to
    if args.try_pruning and args.pruning_threshold > 0.0 and args.pruning_threshold < 1.0:
        ...


if __name__ == '__main__':
    run_model()
```
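The ranking line near the end (`head_ranks[...] = torch.arange(...)`) turns raw importance scores into per-head ranks by scattering consecutive integers into the positions given by the descending sort. A tiny self-contained sketch (not from the commit, with made-up scores) of the same trick:

```python
import torch

scores = torch.tensor([[0.2, 0.9],
                       [0.5, 0.1]])                # (layers, heads) importance scores
ranks = torch.zeros(scores.numel(), dtype=torch.long)
order = scores.view(-1).sort(descending=True)[1]   # flat indices, most important first
ranks[order] = torch.arange(scores.numel())        # most important head gets rank 0
print(ranks.view_as(scores))
# tensor([[2, 0],
#         [1, 3]])
```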
examples/run_classifier.py

@@ -366,7 +366,7 @@ def main():

```diff
         output_args_file = os.path.join(args.output_dir, 'training_args.bin')
         torch.save(args, output_args_file)
     else:
-        model = BertForSequenceClassification.from_pretrained(args.bert_model)
+        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
     model.to(device)
```

...
pytorch_pretrained_bert/modeling.py

@@ -707,36 +707,15 @@ class BertPreTrainedModel(nn.Module):

```diff
                 archive_file, resolved_archive_file))
             logger.info("loading configuration file {} from cache at {}".format(
                 config_file, resolved_config_file))
-        ### Switching to split config/weight files configuration
-        # tempdir = None
-        # if os.path.isdir(resolved_archive_file) or from_tf:
-        #     serialization_dir = resolved_archive_file
-        # else:
-        #     # Extract archive to temp dir
-        #     tempdir = tempfile.mkdtemp()
-        #     logger.info("extracting archive file {} to temp dir {}".format(
-        #         resolved_archive_file, tempdir))
-        #     with tarfile.open(resolved_archive_file, 'r:gz') as archive:
-        #         archive.extractall(tempdir)
-        #     serialization_dir = tempdir
-        # config_file = os.path.join(serialization_dir, CONFIG_NAME)
-        # if not os.path.exists(config_file):
-        #     # Backward compatibility with old naming format
-        #     config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
         # Load config
         config = BertConfig.from_json_file(resolved_config_file)
         logger.info("Model config {}".format(config))
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
-            # weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
             state_dict = torch.load(resolved_archive_file, map_location='cpu')
-            # if tempdir:
-            #     # Clean up temp dir
-            #     shutil.rmtree(tempdir)
         if from_tf:
             # Directly load from a TensorFlow checkpoint
-            # weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
             return load_tf_weights_in_bert(model, weights_path)
         # Load from a PyTorch state_dict
         old_keys = []
```

...
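The deleted lines are the old tar-archive extraction path; after this change `from_pretrained` reads a standalone config file and a standalone weights file directly. A minimal sketch (not the library code) of that split-file loading pattern, assuming a local directory that already contains the two files and noting that the file names below are illustrative:

```python
# Illustrative sketch: load a BERT model from split config/weight files,
# roughly mirroring what from_pretrained does after this commit.
import torch
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification

config = BertConfig.from_json_file("my_model_dir/bert_config.json")            # hypothetical path
model = BertForSequenceClassification(config, num_labels=2)
state_dict = torch.load("my_model_dir/pytorch_model.bin", map_location='cpu')  # hypothetical path
model.load_state_dict(state_dict, strict=False)  # strict=False: tolerate head/naming mismatches
model.eval()
```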
pytorch_pretrained_bert/tokenization.py

@@ -37,6 +37,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {

```diff
     'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
     'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
     'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
+    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
+    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
+    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
 }
 PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
     'bert-base-uncased': 512,
```

@@ -49,6 +52,9 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {

```diff
     'bert-base-german-cased': 512,
     'bert-large-uncased-whole-word-masking': 512,
     'bert-large-cased-whole-word-masking': 512,
+    'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
+    'bert-large-cased-whole-word-masking-finetuned-squad': 512,
+    'bert-base-cased-finetuned-mrpc': 512,
 }
 VOCAB_NAME = 'vocab.txt'
```

...
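These map entries register the fine-tuned checkpoints as tokenizer shortcut names. A short usage sketch (assumptions: network access to the vocab URLs above; the sentence is arbitrary):

```python
# Illustrative usage of one of the registered shortcut names.
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking',
                                          do_lower_case=True)
tokens = tokenizer.tokenize("Who was Jim Henson?")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)
```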