Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
dc8e0019
Commit
dc8e0019
authored
Jun 19, 2019
by
thomwolf
Browse files
updating examples
parent
68ab9599
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
212 additions
and
42 deletions
+212
-42
README.md
README.md
+23
-0
examples/bertology.py
examples/bertology.py
+182
-20
examples/run_classifier.py
examples/run_classifier.py
+1
-1
pytorch_pretrained_bert/modeling.py
pytorch_pretrained_bert/modeling.py
+0
-21
pytorch_pretrained_bert/tokenization.py
pytorch_pretrained_bert/tokenization.py
+6
-0
No files found.
README.md
View file @
dc8e0019
...
@@ -1288,6 +1288,29 @@ Training with these hyper-parameters gave us the following results:
...
@@ -1288,6 +1288,29 @@ Training with these hyper-parameters gave us the following results:
loss
=
0.07231863956341798
loss
=
0.07231863956341798
```
```
Here is an example on MNLI:
```
bash
python
-m
torch.distributed.launch
--nproc_per_node
8 run_classifier.py
--bert_model
bert-large-uncased-whole-word-masking
--task_name
mnli
--do_train
--do_eval
--do_lower_case
--data_dir
/datadrive/bert_data/glue_data//MNLI/
--max_seq_length
128
--train_batch_size
8
--learning_rate
2e-5
--num_train_epochs
3.0
--output_dir
../models/wwm-uncased-finetuned-mnli/
--overwrite_output_dir
```
```
bash
*****
Eval results
*****
acc
=
0.8679706601466992
eval_loss
=
0.4911287787382479
global_step
=
18408
loss
=
0.04755385363816904
*****
Eval results
*****
acc
=
0.8747965825874695
eval_loss
=
0.45516540421714036
global_step
=
18408
loss
=
0.04755385363816904
```
This is the example of the
`bert-large-uncased-whole-word-masking-finetuned-mnli`
model
#### SQuAD
#### SQuAD
This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
...
...
examples/bertology.py
View file @
dc8e0019
#!/usr/bin/env python3
#!/usr/bin/env python3
import
os
import
argparse
import
argparse
import
logging
import
logging
from
tqdm
import
t
range
from
tqdm
import
t
qdm
import
torch
import
torch.nn.functional
as
F
import
numpy
as
np
import
numpy
as
np
import
torch
from
torch.utils.data
import
DataLoader
,
SequentialSampler
,
TensorDataset
,
Subset
from
torch.utils.data.distributed
import
DistributedSampler
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
pytorch_pretrained_bert
import
BertForSequenceClassification
,
BertTokenizer
from
pytorch_pretrained_bert
import
BertForSequenceClassification
,
BertTokenizer
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
from
run_classifier_dataset_utils
import
processors
,
output_modes
,
convert_examples_to_features
,
compute_metrics
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
def entropy(p):
    """Return the entropy of ``p`` computed along its last dimension.

    The result is ``-sum(p * log(p))`` over the last axis (natural log).
    Positions where ``p`` is exactly zero contribute zero to the sum,
    following the usual convention ``0 * log(0) = 0``.
    """
    weighted_log = torch.log(p) * p
    # log(0) is -inf and 0 * -inf is NaN, so blank out those positions.
    weighted_log = weighted_log.masked_fill(p == 0, 0)
    total = weighted_log.sum(dim=-1)
    return -total
def print_1d_tensor(tensor, prefix=""):
    """Log the elements of ``tensor`` on one tab-separated line.

    ``prefix`` is prepended to the line. Tensors of dtype ``torch.long`` are
    formatted as integers; every other dtype is formatted with five decimals.
    """
    values = tensor.cpu().data
    if tensor.dtype == torch.long:
        cells = [f"{x:d}" for x in values]
    else:
        cells = [f"{x:.5f}" for x in values]
    logger.info(prefix + "\t".join(cells))
def print_2d_tensor(tensor):
    """Log a 2D tensor as a tab-separated table, one row per logged line.

    The first logged line is a header that numbers the columns starting at 1;
    each following line is labelled ``layer <row number>`` and delegates the
    formatting of the row to ``print_1d_tensor``.

    Args:
        tensor: a 2D tensor; rows are labelled as layers, columns as heads.
    """
    # The header labels the columns, so it must be sized by the second
    # dimension. Sizing it by the number of rows is only correct when the
    # matrix happens to be square.
    n_columns = tensor.shape[1]
    header = "\t".join(f"{x + 1}" for x in range(n_columns))
    logger.info("lv, h >\t" + header)
    for row, row_values in enumerate(tensor):
        print_1d_tensor(row_values, prefix=f"layer {row + 1}:\t")
def compute_heads_importance(args, model, eval_dataloader):
    """ Example on how to use model outputs to compute:
        - head attention entropy (activated by setting output_attentions=True when we created the model)
        - head importance scores according to http://arxiv.org/abs/1905.10650
          (activated by setting keep_multihead_output=True when we created the model)

    Args:
        args: parsed command-line namespace. ``device``, ``local_rank``,
            ``task_name`` and ``normalize_importance`` are read here.
        model: model whose forward pass returns ``(all_attentions, logits)`` and
            whose ``bert`` sub-module exposes ``config`` and
            ``get_multihead_outputs()``. It may be wrapped in
            ``DistributedDataParallel``.
        eval_dataloader: iterable yielding
            ``(input_ids, input_mask, segment_ids, label_ids)`` batches.

    Returns:
        A tuple ``(attn_entropy, head_importance)`` of two
        ``(num_hidden_layers, num_attention_heads)`` tensors on ``args.device``,
        each divided by the summed input masks of all batches.
        ``head_importance`` is additionally min-max rescaled when
        ``args.normalize_importance`` is set.

    Raises:
        ValueError: if the task's output mode is neither "classification"
            nor "regression".
    """
    # DistributedDataParallel does not forward attribute access to the wrapped
    # model, so reach through `.module` when it is present.
    base_model = model.module if hasattr(model, "module") else model
    config = base_model.bert.config
    n_layers, n_heads = config.num_hidden_layers, config.num_attention_heads

    # The accumulators are owned by this function: they are updated in place
    # below, so they must be local names that exist before the loop starts.
    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
    tot_tokens = 0.0

    # Pick the loss once; it only depends on the task, not on the batch.
    output_mode = output_modes[args.task_name.lower()]
    if output_mode == "classification":
        loss_fct = CrossEntropyLoss()
    elif output_mode == "regression":
        loss_fct = MSELoss()
    else:
        raise ValueError("Unsupported output mode: {}".format(output_mode))

    for batch in tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]):
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # Do a forward pass
        all_attentions, logits = model(input_ids, segment_ids, input_mask)

        # Update head attention entropy
        for layer, attn in enumerate(all_attentions):
            masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
            attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()

        # Update head importance scores with regards to our loss
        # First backpropagate to populate the gradients
        if output_mode == "classification":
            # The last dimension of the logits is the number of labels.
            loss = loss_fct(logits.view(-1, logits.size(-1)), label_ids.view(-1))
        else:
            loss = loss_fct(logits.view(-1), label_ids.view(-1))
        loss.backward()

        # Second compute importance scores according to http://arxiv.org/abs/1905.10650
        multihead_outputs = base_model.bert.get_multihead_outputs()
        for layer, mh_layer_output in enumerate(multihead_outputs):
            dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output])
            head_importance[layer] += dot.abs().sum(-1).sum(0).detach()

        tot_tokens += input_mask.float().detach().sum().data

    # Normalize
    attn_entropy /= tot_tokens
    head_importance /= tot_tokens
    if args.normalize_importance:
        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())

    return attn_entropy, head_importance
def
run_model
():
def
run_model
():
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--model_name_or_path'
,
type
=
str
,
default
=
'bert-base-uncased'
,
help
=
'pretrained model name or path to local checkpoint'
)
parser
.
add_argument
(
'--model_name_or_path'
,
type
=
str
,
default
=
'bert-base-cased-finetuned-mrpc'
,
help
=
'pretrained model name or path to local checkpoint'
)
parser
.
add_argument
(
"--task_name"
,
type
=
str
,
default
=
'mrpc'
,
help
=
"The name of the task to train."
)
parser
.
add_argument
(
"--data_dir"
,
type
=
str
,
required
=
True
,
help
=
"The input data dir. Should contain the .tsv files (or other data files) for the task."
)
parser
.
add_argument
(
"--output_dir"
,
type
=
str
,
required
=
True
,
help
=
"The output directory where the model predictions and checkpoints will be written."
)
parser
.
add_argument
(
"--data_subset"
,
type
=
int
,
default
=-
1
,
help
=
"If > 0: limit the data to a subset of data_subset instances."
)
parser
.
add_argument
(
"--overwrite_output_dir"
,
action
=
'store_true'
,
help
=
"Whether to overwrite data in output directory"
)
parser
.
add_argument
(
"--normalize_importance"
,
action
=
'store_true'
,
help
=
"Whether to normalize importance score between 0 and 1"
)
parser
.
add_argument
(
"--try_pruning"
,
action
=
'store_true'
,
help
=
"Whether to try to prune head until a threshold of accuracy."
)
parser
.
add_argument
(
"--pruning_threshold"
,
default
=
0.9
,
type
=
float
,
help
=
"Pruning threshold of accuracy."
)
parser
.
add_argument
(
"--max_seq_length"
,
default
=
128
,
type
=
int
,
help
=
"The maximum total input sequence length after WordPiece tokenization.
\n
"
"Sequences longer than this will be truncated, and sequences shorter
\n
"
"than this will be padded."
)
parser
.
add_argument
(
"--batch_size"
,
default
=
1
,
type
=
int
,
help
=
"Batch size."
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
42
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
42
)
parser
.
add_argument
(
"--local_rank"
,
type
=
int
,
default
=-
1
,
help
=
"local_rank for distributed training on gpus"
)
parser
.
add_argument
(
"--local_rank"
,
type
=
int
,
default
=-
1
,
help
=
"local_rank for distributed training on gpus"
)
parser
.
add_argument
(
"--no_cuda"
,
action
=
'store_true'
,
help
=
"Whether not to use CUDA when available"
)
parser
.
add_argument
(
"--no_cuda"
,
action
=
'store_true'
,
help
=
"Whether not to use CUDA when available"
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
np
.
random
.
seed
(
args
.
seed
)
# Setup devices and distributed training
torch
.
random
.
manual_seed
(
args
.
seed
)
torch
.
cuda
.
manual_seed
(
args
.
seed
)
if
args
.
local_rank
==
-
1
or
args
.
no_cuda
:
if
args
.
local_rank
==
-
1
or
args
.
no_cuda
:
args
.
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
and
not
args
.
no_cuda
else
"cpu"
)
args
.
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
and
not
args
.
no_cuda
else
"cpu"
)
n_gpu
=
torch
.
cuda
.
device_count
()
n_gpu
=
torch
.
cuda
.
device_count
()
...
@@ -34,21 +110,107 @@ def run_model():
...
@@ -34,21 +110,107 @@ def run_model():
torch
.
cuda
.
set_device
(
args
.
local_rank
)
torch
.
cuda
.
set_device
(
args
.
local_rank
)
args
.
device
=
torch
.
device
(
"cuda"
,
args
.
local_rank
)
args
.
device
=
torch
.
device
(
"cuda"
,
args
.
local_rank
)
n_gpu
=
1
n_gpu
=
1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch
.
distributed
.
init_process_group
(
backend
=
'nccl'
)
# Initializes the distributed backend
torch
.
distributed
.
init_process_group
(
backend
=
'nccl'
)
# Setup logging
logging
.
basicConfig
(
level
=
logging
.
INFO
if
args
.
local_rank
in
[
-
1
,
0
]
else
logging
.
WARN
)
logging
.
basicConfig
(
level
=
logging
.
INFO
if
args
.
local_rank
in
[
-
1
,
0
]
else
logging
.
WARN
)
logger
.
info
(
"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
.
format
(
logger
.
info
(
"device: {} n_gpu: {}, distributed: {}"
.
format
(
args
.
device
,
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
)))
args
.
device
,
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
),
args
.
fp16
))
# Set seeds
np
.
random
.
seed
(
args
.
seed
)
torch
.
random
.
manual_seed
(
args
.
seed
)
if
n_gpu
>
0
:
torch
.
cuda
.
manual_seed
(
args
.
seed
)
# Prepare GLUE task
task_name
=
args
.
task_name
.
lower
()
processor
=
processors
[
task_name
]()
output_mode
=
output_modes
[
task_name
]
label_list
=
processor
.
get_labels
()
num_labels
=
len
(
label_list
)
# Prepare output directory
if
os
.
path
.
exists
(
args
.
output_dir
)
and
os
.
listdir
(
args
.
output_dir
)
and
not
args
.
overwrite_output_dir
:
raise
ValueError
(
"Output directory ({}) already exists and is not empty."
.
format
(
args
.
output_dir
))
if
not
os
.
path
.
exists
(
args
.
output_dir
)
and
args
.
local_rank
in
[
-
1
,
0
]:
os
.
makedirs
(
args
.
output_dir
)
# Load model & tokenizer
if
args
.
local_rank
not
in
[
-
1
,
0
]:
torch
.
distributed
.
barrier
()
# Make sure only one distributed process downloads model & vocab
tokenizer
=
BertTokenizer
.
from_pretrained
(
args
.
model_name_or_path
)
tokenizer
=
BertTokenizer
.
from_pretrained
(
args
.
model_name_or_path
)
model
=
BertForSequenceClassification
.
from_pretrained
(
args
.
model_name_or_path
)
# Load a model with all BERTology options on:
# output_attentions => will output attention weights
# keep_multihead_output => will store gradient of attention head outputs for head importance computation
# see: http://arxiv.org/abs/1905.10650
model
=
BertForSequenceClassification
.
from_pretrained
(
args
.
model_name_or_path
,
num_labels
=
num_labels
,
output_attentions
=
True
,
keep_multihead_output
=
True
)
if
args
.
local_rank
==
0
:
torch
.
distributed
.
barrier
()
# Make sure only one distributed process downloads model & vocab
model
.
to
(
args
.
device
)
model
.
to
(
args
.
device
)
if
args
.
local_rank
!=
-
1
:
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
,
device_ids
=
[
args
.
local_rank
],
output_device
=
args
.
local_rank
,
find_unused_parameters
=
True
)
model
.
eval
()
model
.
eval
()
# Prepare dataset for the GLUE task
eval_examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
cached_eval_features_file
=
os
.
path
.
join
(
args
.
data_dir
,
'dev_{0}_{1}_{2}'
.
format
(
list
(
filter
(
None
,
args
.
model_name_or_path
.
split
(
'/'
))).
pop
(),
str
(
args
.
max_seq_length
),
str
(
task_name
)))
try
:
eval_features
=
torch
.
load
(
cached_eval_features_file
)
except
:
eval_features
=
convert_examples_to_features
(
eval_examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
,
output_mode
)
if
args
.
local_rank
in
[
-
1
,
0
]:
logger
.
info
(
"Saving eval features to cache file %s"
,
cached_eval_features_file
)
torch
.
save
(
eval_features
,
cached_eval_features_file
)
if
__name__
==
'__main__'
:
all_input_ids
=
torch
.
tensor
([
f
.
input_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
run_model
()
all_input_mask
=
torch
.
tensor
([
f
.
input_mask
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_segment_ids
=
torch
.
tensor
([
f
.
segment_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_label_ids
=
torch
.
tensor
([
f
.
label_id
for
f
in
eval_features
],
dtype
=
torch
.
long
if
output_mode
==
"classification"
else
torch
.
float
)
eval_data
=
TensorDataset
(
all_input_ids
,
all_input_mask
,
all_segment_ids
,
all_label_ids
)
if
args
.
data_subset
>
0
:
eval_data
=
Subset
(
eval_data
,
list
(
range
(
args
.
data_subset
)))
eval_sampler
=
SequentialSampler
(
eval_data
)
if
args
.
local_rank
==
-
1
else
DistributedSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
batch_size
)
# Print/save training arguments
print
(
args
)
torch
.
save
(
args
,
os
.
path
.
join
(
args
.
output_dir
,
'run_args.bin'
))
# To showcase some BERTology methods, we will compute:
# - the average entropy of each head over the dev set
# - the importance score of each head over the dev set as explained in http://arxiv.org/abs/1905.10650
n_layers
,
n_heads
=
model
.
bert
.
config
.
num_hidden_layers
,
model
.
bert
.
config
.
num_attention_heads
head_importance
=
torch
.
zeros
(
n_layers
,
n_heads
).
to
(
args
.
device
)
attn_entropy
=
torch
.
zeros
(
n_layers
,
n_heads
).
to
(
args
.
device
)
tot_tokens
=
0.0
# Compute head entropy and importance score
attn_entropy
,
head_importance
=
compute_heads_importance
(
args
,
model
,
eval_dataloader
)
# Print/save matrices
np
.
save
(
os
.
path
.
join
(
args
.
output_dir
,
'attn_entropy.npy'
),
attn_entropy
)
np
.
save
(
os
.
path
.
join
(
args
.
output_dir
,
'head_importance.npy'
),
head_importance
)
logger
.
info
(
"Attention entropies"
)
print_2d_tensor
(
attn_entropy
)
logger
.
info
(
"Head importance scores"
)
print_2d_tensor
(
head_importance
)
logger
.
info
(
"Head ranked by importance scores"
)
head_ranks
=
torch
.
zeros
(
n_layers
*
n_heads
,
dtype
=
torch
.
long
,
device
=
args
.
device
)
head_ranks
[
head_importance
.
view
(
-
1
).
sort
(
descending
=
True
)[
1
]]
=
torch
.
arange
(
head_importance
.
numel
())
print_2d_tensor
(
head_ranks
.
view_as
(
head_importance
))
# Do pruning if we want to
if
args
.
try_pruning
and
args
.
pruning_threshold
>
0.0
and
args
.
pruning_threshold
<
1.0
:
if
__name__
==
'__main__'
:
run_model
()
examples/run_classifier.py
View file @
dc8e0019
...
@@ -366,7 +366,7 @@ def main():
...
@@ -366,7 +366,7 @@ def main():
output_args_file
=
os
.
path
.
join
(
args
.
output_dir
,
'training_args.bin'
)
output_args_file
=
os
.
path
.
join
(
args
.
output_dir
,
'training_args.bin'
)
torch
.
save
(
args
,
output_args_file
)
torch
.
save
(
args
,
output_args_file
)
else
:
else
:
model
=
BertForSequenceClassification
.
from_pretrained
(
args
.
bert_model
)
model
=
BertForSequenceClassification
.
from_pretrained
(
args
.
bert_model
,
num_labels
=
num_labels
)
model
.
to
(
device
)
model
.
to
(
device
)
...
...
pytorch_pretrained_bert/modeling.py
View file @
dc8e0019
...
@@ -707,36 +707,15 @@ class BertPreTrainedModel(nn.Module):
...
@@ -707,36 +707,15 @@ class BertPreTrainedModel(nn.Module):
archive_file
,
resolved_archive_file
))
archive_file
,
resolved_archive_file
))
logger
.
info
(
"loading configuration file {} from cache at {}"
.
format
(
logger
.
info
(
"loading configuration file {} from cache at {}"
.
format
(
config_file
,
resolved_config_file
))
config_file
,
resolved_config_file
))
### Switching to split config/weight files configuration
# tempdir = None
# if os.path.isdir(resolved_archive_file) or from_tf:
# serialization_dir = resolved_archive_file
# else:
# # Extract archive to temp dir
# tempdir = tempfile.mkdtemp()
# logger.info("extracting archive file {} to temp dir {}".format(
# resolved_archive_file, tempdir))
# with tarfile.open(resolved_archive_file, 'r:gz') as archive:
# archive.extractall(tempdir)
# serialization_dir = tempdir
# config_file = os.path.join(serialization_dir, CONFIG_NAME)
# if not os.path.exists(config_file):
# # Backward compatibility with old naming format
# config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
# Load config
# Load config
config
=
BertConfig
.
from_json_file
(
resolved_config_file
)
config
=
BertConfig
.
from_json_file
(
resolved_config_file
)
logger
.
info
(
"Model config {}"
.
format
(
config
))
logger
.
info
(
"Model config {}"
.
format
(
config
))
# Instantiate model.
# Instantiate model.
model
=
cls
(
config
,
*
inputs
,
**
kwargs
)
model
=
cls
(
config
,
*
inputs
,
**
kwargs
)
if
state_dict
is
None
and
not
from_tf
:
if
state_dict
is
None
and
not
from_tf
:
# weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
state_dict
=
torch
.
load
(
resolved_archive_file
,
map_location
=
'cpu'
)
state_dict
=
torch
.
load
(
resolved_archive_file
,
map_location
=
'cpu'
)
# if tempdir:
# # Clean up temp dir
# shutil.rmtree(tempdir)
if
from_tf
:
if
from_tf
:
# Directly load from a TensorFlow checkpoint
# Directly load from a TensorFlow checkpoint
# weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
return
load_tf_weights_in_bert
(
model
,
weights_path
)
return
load_tf_weights_in_bert
(
model
,
weights_path
)
# Load from a PyTorch state_dict
# Load from a PyTorch state_dict
old_keys
=
[]
old_keys
=
[]
...
...
pytorch_pretrained_bert/tokenization.py
View file @
dc8e0019
...
@@ -37,6 +37,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
...
@@ -37,6 +37,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-german-cased'
:
"https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt"
,
'bert-base-german-cased'
:
"https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt"
,
'bert-large-uncased-whole-word-masking'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt"
,
'bert-large-uncased-whole-word-masking'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt"
,
'bert-large-cased-whole-word-masking'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt"
,
'bert-large-cased-whole-word-masking'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt"
,
'bert-large-uncased-whole-word-masking-finetuned-squad'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt"
,
'bert-large-cased-whole-word-masking-finetuned-squad'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt"
,
'bert-base-cased-finetuned-mrpc'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt"
,
}
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
=
{
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
=
{
'bert-base-uncased'
:
512
,
'bert-base-uncased'
:
512
,
...
@@ -49,6 +52,9 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
...
@@ -49,6 +52,9 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-german-cased'
:
512
,
'bert-base-german-cased'
:
512
,
'bert-large-uncased-whole-word-masking'
:
512
,
'bert-large-uncased-whole-word-masking'
:
512
,
'bert-large-cased-whole-word-masking'
:
512
,
'bert-large-cased-whole-word-masking'
:
512
,
'bert-large-uncased-whole-word-masking-finetuned-squad'
:
512
,
'bert-large-cased-whole-word-masking-finetuned-squad'
:
512
,
'bert-base-cased-finetuned-mrpc'
:
512
,
}
}
VOCAB_NAME
=
'vocab.txt'
VOCAB_NAME
=
'vocab.txt'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment