Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
3a9c8837
Commit
3a9c8837
authored
Jan 15, 2019
by
thomwolf
Browse files
adding Transformer XL
parent
e5c78c66
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
2396 additions
and
0 deletions
+2396
-0
pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
...etrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+125
-0
pytorch_pretrained_bert/modeling_openai.py
pytorch_pretrained_bert/modeling_openai.py
+17
-0
pytorch_pretrained_bert/modeling_transfo_xl.py
pytorch_pretrained_bert/modeling_transfo_xl.py
+1432
-0
pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+314
-0
pytorch_pretrained_bert/tokenization_transfo_xl.py
pytorch_pretrained_bert/tokenization_transfo_xl.py
+508
-0
No files found.
pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
0 → 100755
View file @
3a9c8837
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Transformer XL checkpoint."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
re
import
argparse
import
tensorflow
as
tf
import
torch
import
numpy
as
np
from
.modeling_transfo_xl
import
TransfoXLConfig
,
TransfoXLModel
,
CONFIG_NAME
,
WEIGHTS_NAME
def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                             transfo_xl_config_file,
                                             pytorch_dump_folder_path):
    """Convert a TensorFlow Transformer XL checkpoint to a PyTorch dump.

    Every variable stored in the TensorFlow checkpoint is loaded, mapped onto
    the attribute of a freshly built ``TransfoXLModel`` designated by its
    ``/``-separated name, and copied into it. The model ``state_dict`` and
    the configuration (as JSON) are then written to
    ``pytorch_dump_folder_path``.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.
        transfo_xl_config_file: Value handed to ``TransfoXLConfig``; an empty
            string selects the default configuration.
        pytorch_dump_folder_path: Folder receiving the ``WEIGHTS_NAME`` and
            ``CONFIG_NAME`` files (it is not created here).

    Raises:
        AssertionError: If a TensorFlow array and the PyTorch tensor it maps
            to have different shapes; the two shapes are the exception args.
    """
    config_path = os.path.abspath(transfo_xl_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)

    print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    # Construct model
    if transfo_xl_config_file == "":
        config = TransfoXLConfig()
    else:
        config = TransfoXLConfig(transfo_xl_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = TransfoXLModel(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            # "layer_3" style scopes are split into an attribute name and an index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                scope = re.split(r'_(\d+)', m_name)
            else:
                scope = [m_name]
            if scope[0] == 'kernel' or scope[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif scope[0] == 'output_bias' or scope[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif scope[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, scope[0])
            if len(scope) >= 2:
                num = int(scope[1])
                pointer = pointer[num]
        # `m_name` is the last component of the variable name at this point.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        # Raised explicitly (rather than via `assert`) so that the check is
        # not stripped when Python runs with -O.
        if pointer.shape != array.shape:
            raise AssertionError(pointer.shape, array.shape)
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
if __name__ == "__main__":
    # Command-line entry point: all three paths are mandatory.
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--transfo_xl_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained Transformer XL model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_folder_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the folder where the PyTorch model and its configuration are written.")
    args = parser.parse_args()
    convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                             args.transfo_xl_config_file,
                                             args.pytorch_dump_folder_path)
pytorch_pretrained_bert/modeling_openai.py
View file @
3a9c8837
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI GPT model."""
import
os
import
os
import
copy
import
copy
import
json
import
json
...
...
pytorch_pretrained_bert/modeling_transfo_xl.py
0 → 100644
View file @
3a9c8837
This diff is collapsed.
Click to expand it.
pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
0 → 100644
View file @
3a9c8837
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Utilities for PyTorch Transformer XL model.
Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
from
collections
import
defaultdict
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
class ProjectedAdaptiveLogSoftmax(nn.Module):
    """Adaptive log-softmax with an optional projection per vocabulary cluster.

    The vocabulary ``[0, n_token)`` is cut at ``cutoffs`` into a head
    shortlist followed by ``n_clusters`` tail clusters. The head scores the
    shortlist tokens plus one entry per tail cluster; each tail cluster then
    scores its own tokens. ``forward`` returns the negative log-likelihood of
    every target.
    """

    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
                 keep_order=False):
        """Create the output layers and projections.

        Args:
            n_token: Vocabulary size.
            d_embed: Embedding width of the first cluster.
            d_proj: Width of the hidden states given to ``forward``.
            cutoffs: List of cluster boundaries; ``n_token`` is appended.
            div_val: With 1, a single ``Linear(d_embed, n_token)`` is shared
                and sliced per cluster; otherwise cluster ``i`` gets its own
                layer of width ``d_embed // div_val ** i``.
            keep_order: When true, ``forward`` writes losses at the positions
                of their targets instead of grouping them by cluster.
        """
        super(ProjectedAdaptiveLogSoftmax, self).__init__()

        self.n_token = n_token
        self.d_embed = d_embed
        self.d_proj = d_proj
        self.div_val = div_val

        self.cutoffs = cutoffs + [n_token]
        self.cutoff_ends = [0] + self.cutoffs

        self.shortlist_size = self.cutoffs[0]
        self.n_clusters = len(self.cutoffs) - 1
        self.head_size = self.shortlist_size + self.n_clusters

        if self.n_clusters > 0:
            # One extra head row (and bias) per tail cluster.
            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))

        self.out_layers = nn.ModuleList()
        self.out_projs = nn.ParameterList()

        if div_val == 1:
            needs_proj = d_proj != d_embed
            for _ in self.cutoffs:
                proj = nn.Parameter(torch.Tensor(d_proj, d_embed)) if needs_proj else None
                self.out_projs.append(proj)
            # A single layer covers the whole vocabulary; forward() slices it.
            self.out_layers.append(nn.Linear(d_embed, n_token))
        else:
            spans = zip(self.cutoff_ends[:-1], self.cutoff_ends[1:])
            for i, (l_idx, r_idx) in enumerate(spans):
                d_emb_i = d_embed // (div_val ** i)
                self.out_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i)))
                self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))

        self.keep_order = keep_order

    def _compute_logit(self, hidden, weight, bias, proj):
        """Return ``hidden`` (optionally projected by ``proj``) scored by ``weight``/``bias``."""
        if proj is not None:
            hidden = F.linear(hidden, proj.t().contiguous())
        return F.linear(hidden, weight, bias=bias)

    def forward(self, hidden, target, keep_order=False):
        '''
            hidden :: [len*bsz x d_proj]
            target :: [len*bsz]

            Returns the negative log-likelihood of each target. Unless
            ``keep_order`` (argument or attribute) is set, the values are
            grouped cluster by cluster rather than in input order.
        '''
        if hidden.size(0) != target.size(0):
            raise RuntimeError('Input and target should have the same size '
                               'in the batch dimension.')

        if self.n_clusters == 0:
            layer = self.out_layers[0]
            logit = self._compute_logit(hidden, layer.weight, layer.bias,
                                        self.out_projs[0])
            log_probs = F.log_softmax(logit, dim=-1)
            return -log_probs.gather(1, target.unsqueeze(1)).squeeze(1)

        # construct weights and biases
        weights, biases = [], []
        for i in range(len(self.cutoffs)):
            if self.div_val == 1:
                span = slice(self.cutoff_ends[i], self.cutoff_ends[i + 1])
                shared = self.out_layers[0]
                weight_i, bias_i = shared.weight[span], shared.bias[span]
            else:
                own = self.out_layers[i]
                weight_i, bias_i = own.weight, own.bias

            if i == 0:
                # The head also scores the tail clusters.
                weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
                bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)

            weights.append(weight_i)
            biases.append(bias_i)

        head_logit = self._compute_logit(hidden, weights[0], biases[0],
                                         self.out_projs[0])
        head_logprob = F.log_softmax(head_logit, dim=1)

        nll = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)
        in_place = getattr(self, 'keep_order', False) or keep_order

        offset = 0
        bounds = [0] + self.cutoffs
        for i, (l_idx, r_idx) in enumerate(zip(bounds[:-1], bounds[1:])):
            mask_i = (target >= l_idx) & (target < r_idx)
            indices_i = mask_i.nonzero().squeeze()
            if indices_i.numel() == 0:
                continue

            target_i = target.index_select(0, indices_i) - l_idx
            head_logprob_i = head_logprob.index_select(0, indices_i)

            if i == 0:
                logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
            else:
                hidden_i = hidden.index_select(0, indices_i)
                tail_logit_i = self._compute_logit(hidden_i, weights[i],
                                                   biases[i], self.out_projs[i])
                tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                # Cluster i is read from head column -i (counted from the end).
                cluster_logprob_i = head_logprob_i[:, -i]
                logprob_i = cluster_logprob_i \
                    + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)

            if in_place:
                nll.index_copy_(0, indices_i, -logprob_i)
            else:
                nll[offset:offset + logprob_i.size(0)].copy_(-logprob_i)

            offset += logprob_i.size(0)

        return nll
class LogUniformSampler(object):
    """Draws negative class ids from a log-uniform distribution over ``[0, range_max)``."""

    def __init__(self, range_max, n_sample):
        """
        Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
            `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`

        expected count can be approximated by 1 - (1 - p)^n
        and we use a numerically stable version -expm1(num_tries * log1p(-p))

        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
        """
        with torch.no_grad():
            self.range_max = range_max
            log_indices = torch.arange(1., range_max + 2., 1.).log_()
            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]

            # NOTE(review): this negates log1p(p), whereas the formula in the
            # docstring uses log1p(-p) -- confirm which one is intended.
            scaled = -self.dist.double().log1p_() * 2 * n_sample
            self.log_q = (-scaled.expm1_()).log_().float()

        self.n_sample = n_sample

    def sample(self, labels):
        """
            labels: [b1, b2]
        Return
            true_log_probs: [b1, b2]
            samp_log_probs: [n_sample]
            neg_samples: [n_sample]
        """
        n_tries = 2 * self.n_sample

        with torch.no_grad():
            device = labels.device
            # Duplicates are dropped, so fewer than n_tries ids may remain.
            drawn = torch.multinomial(self.dist, n_tries, replacement=True)
            neg_samples = drawn.unique().to(device)
            true_log_probs = self.log_q[labels].to(device)
            samp_log_probs = self.log_q[neg_samples].to(device)

        return true_log_probs, samp_log_probs, neg_samples
def sample_logits(embedding, bias, labels, inputs, sampler):
    """
        embedding: an nn.Embedding layer
        bias: [n_vocab]
        labels: [b1, b2]
        inputs: [b1, b2, n_emb]
        sampler: you may use a LogUniformSampler
    Return
        logits: [b1, b2, 1 + n_sample]

    Column 0 of the result scores the true label; the remaining columns score
    the sampled negatives. Negatives equal to the true label are set to -1e30.
    """
    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
    n_sample = neg_samples.size(0)
    b1, b2 = labels.size(0), labels.size(1)

    # Look up targets and negatives in one pass, then split the result.
    all_ids = torch.cat([labels.view(-1), neg_samples])
    all_w = embedding(all_ids)
    all_b = bias[all_ids]

    true_w = all_w[:-n_sample].view(b1, b2, -1)
    true_b = all_b[:-n_sample].view(b1, b2)
    sample_w = all_w[-n_sample:].view(n_sample, -1)
    sample_b = all_b[-n_sample:]

    true_scores = torch.einsum('ijk,ijk->ij', [true_w, inputs]) \
        + true_b - true_log_probs
    neg_scores = torch.einsum('lk,ijk->ijl', [sample_w, inputs]) \
        + sample_b - samp_log_probs

    # Mask the negatives that collide with the true label of their position.
    hit = (labels[:, :, None] == neg_samples).detach()
    neg_scores.masked_fill_(hit, -1e30)

    return torch.cat([true_scores[:, :, None], neg_scores], -1)
# class LogUniformSampler(object):
# def __init__(self, range_max, unique=False):
# """
# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
# """
# self.range_max = range_max
# log_indices = torch.arange(1., range_max+2., 1.).log_()
# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# self.unique = unique
# if self.unique:
# self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
# def sample(self, n_sample, labels):
# pos_sample, new_labels = labels.unique(return_inverse=True)
# n_pos_sample = pos_sample.size(0)
# n_neg_sample = n_sample - n_pos_sample
# if self.unique:
# self.exclude_mask.index_fill_(0, pos_sample, 1)
# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
# self.exclude_mask.index_fill_(0, pos_sample, 0)
# else:
# sample_dist = self.dist
# neg_sample = torch.multinomial(sample_dist, n_neg_sample)
# sample = torch.cat([pos_sample, neg_sample])
# sample_prob = self.dist[sample]
# return new_labels, sample, sample_prob
if __name__ == '__main__':
    # Smoke test: score random inputs against random labels with sampled
    # negatives and print the resulting logits.
    S, B = 3, 4
    n_vocab = 10000
    n_sample = 5
    H = 32

    labels = torch.LongTensor(S, B).random_(0, n_vocab)

    # LogUniformSampler takes (range_max, n_sample).
    sampler = LogUniformSampler(n_vocab, n_sample)

    embedding = nn.Embedding(n_vocab, H)
    bias = torch.zeros(n_vocab)
    inputs = torch.Tensor(S, B, H).normal_()

    # sample_logits takes five arguments and returns only the logits.
    logits = sample_logits(embedding, bias, labels, inputs, sampler)
    print('logits', logits.detach().numpy().tolist())
    print('logits shape', logits.size())
pytorch_pretrained_bert/tokenization_transfo_xl.py
0 → 100644
View file @
3a9c8837
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for Transformer XL model.
Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
import glob
import json
import logging
import os
import pickle
import re
from collections import Counter, OrderedDict

import numpy as np
import torch
from tqdm import tqdm

from .file_utils import cached_path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shortcut model name -> URL of its vocabulary file.
# NOTE(review): the 'transfo-xl' entry points at a file named
# "openai-gpt-vocab.json" -- confirm the intended Transformer XL vocabulary URL.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'transfo-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
}
# Shortcut model name -> URL of its merges file.
# NOTE(review): keyed by 'openai-gpt', not 'transfo-xl', so the keys do not
# line up with PRETRAINED_VOCAB_ARCHIVE_MAP -- confirm whether this is needed.
PRETRAINED_MERGES_ARCHIVE_MAP = {
    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
}
# Shortcut model name -> number of positional embeddings.
# NOTE(review): also keyed by 'openai-gpt' only -- verify against callers.
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'openai-gpt': 512,
}
# File names looked up inside a local model directory.
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
class TransfoXLTokenizer(object):
    """
    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl

    The vocabulary is either read from ``vocab_file`` or built from the token
    counts accumulated by ``count_file`` / ``count_sents``; in both cases
    ``build_vocab`` must run before symbols can be converted to ids.
    """
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a TransfoXLTokenizer.
        Download and cache the vocabulary if needed.

        ``pretrained_model_name_or_path`` is either a key of
        ``PRETRAINED_VOCAB_ARCHIVE_MAP`` or a directory containing
        ``VOCAB_NAME``. The resolved file is handed to the constructor as
        ``vocab_file`` and the vocabulary is built from it. Returns ``None``
        (after logging an error) when the file cannot be found.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except FileNotFoundError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find file {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))

        # Instantiate tokenizer. The constructor has no merges or max_len
        # parameter, so only the vocabulary file is forwarded, by keyword.
        kwargs['vocab_file'] = resolved_vocab_file
        tokenizer = cls(*inputs, **kwargs)
        tokenizer.build_vocab()
        return tokenizer

    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=True,
                 delimiter=None, vocab_file=None):
        """Store the tokenizer settings; no vocabulary is built here.

        Args:
            special: Symbols added first by ``build_vocab`` (default: none).
            min_freq: Counted symbols rarer than this are left out.
            max_size: Cap on the number of counted symbols kept (``None``: all).
            lower_case: Lower-case every line before splitting it.
            delimiter: Separator for ``str.split``; ``''`` keeps the line
                unsplit so that it is handled character by character.
            vocab_file: If set, ``build_vocab`` reads the vocabulary from it.
        """
        self.counter = Counter()
        # A fresh list per instance: a mutable default would be shared.
        self.special = [] if special is None else special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file

    def count_file(self, path, verbose=False, add_eos=False):
        """Tokenize every line of ``path``, update the counts and return the tokenized lines."""
        if verbose: print('counting file {} ...'.format(path))
        assert os.path.exists(path)

        sents = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print(' line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents

    def count_sents(self, sents, verbose=False):
        """
            sents : a list of sentences, each a list of tokenized symbols
        """
        if verbose: print('counting {} sents ...'.format(len(sents)))
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                print(' line {}'.format(idx))
            self.counter.update(symbols)

    def _build_from_file(self, vocab_file):
        """Load the vocabulary from ``vocab_file`` (first field of each line).

        Raises:
            KeyError: If the file defines neither ``<UNK>`` nor ``<unk>``.
        """
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # Blank lines carry no symbol.
                    continue
                self.add_symbol(fields[0])
        if '<UNK>' not in self.sym2idx and '<unk>' in self.sym2idx:
            self.unk_idx = self.sym2idx['<unk>']
        else:
            self.unk_idx = self.sym2idx['<UNK>']

    def build_vocab(self):
        """Build ``idx2sym`` / ``sym2idx`` from ``vocab_file`` if set, else from the counts."""
        if self.vocab_file:
            print('building vocab from {}'.format(self.vocab_file))
            self._build_from_file(self.vocab_file)
            print('final vocab size {}'.format(len(self)))
        else:
            print('building vocab with min_freq={}, max_size={}'.format(
                self.min_freq, self.max_size))
            self.idx2sym = []
            self.sym2idx = OrderedDict()

            for sym in self.special:
                self.add_special(sym)

            # most_common() is sorted by decreasing count, so the first
            # symbol below min_freq ends the scan.
            for sym, cnt in self.counter.most_common(self.max_size):
                if cnt < self.min_freq: break
                self.add_symbol(sym)

            print('final vocab size {} from {} unique tokens'.format(
                len(self), len(self.counter)))

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
                    add_double_eos=False):
        """Encode each line of ``path`` as a LongTensor of ids.

        Returns a list of tensors, or their concatenation when ``ordered``.
        """
        if verbose: print('encoding file {} ...'.format(path))
        assert os.path.exists(path)
        encoded = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print(' line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos,
                                        add_double_eos=add_double_eos)
                encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        """Encode already tokenized sentences; same return convention as ``encode_file``."""
        if verbose: print('encoding {} sents ...'.format(len(sents)))
        encoded = []
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                print(' line {}'.format(idx))
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def add_special(self, sym):
        """Register ``sym`` and expose its id as ``self.<name>_idx`` (``<``/``>`` stripped)."""
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1
            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])

    def add_symbol(self, sym):
        """Register ``sym`` under the next free id unless it is already known."""
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1

    def get_sym(self, idx):
        """Return the symbol stored at ``idx``."""
        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
        return self.idx2sym[idx]

    def get_idx(self, sym):
        """Return the id of ``sym``, or ``unk_idx`` for an unknown symbol."""
        if sym in self.sym2idx:
            return self.sym2idx[sym]
        else:
            # print('encounter unk {}'.format(sym))
            assert '<eos>' not in sym
            assert hasattr(self, 'unk_idx')
            return self.sym2idx.get(sym, self.unk_idx)

    def convert_ids_to_tokens(self, indices):
        """Converts a sequence of indices in symbols using the vocab."""
        return [self.get_sym(idx) for idx in indices]

    def convert_tokens_to_ids(self, symbols):
        """Converts a sequence of symbols into ids using the vocab."""
        return [self.get_idx(sym) for sym in symbols]

    def convert_to_tensor(self, symbols):
        """Return the ids of ``symbols`` as a ``torch.LongTensor``."""
        return torch.LongTensor(self.convert_tokens_to_ids(symbols))

    def decode(self, indices, exclude=None):
        """Converts a sequence of indices in a string."""
        if exclude is None:
            return ' '.join([self.get_sym(idx) for idx in indices])
        else:
            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])

    def __len__(self):
        return len(self.idx2sym)

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        """Split ``line`` into symbols, optionally adding end markers.

        With ``delimiter == ''`` and no marker the stripped line itself is
        returned (a string, iterated character by character by callers).
        """
        line = line.strip()
        # convert to lower case
        if self.lower_case:
            line = line.lower()

        # empty delimiter '' will evaluate False
        if self.delimiter == '':
            symbols = line
        else:
            symbols = line.split(self.delimiter)

        # list() lets the markers be appended to the unsplit string of the
        # delimiter == '' case as well.
        if add_double_eos:  # lm1b
            return ['<S>'] + list(symbols) + ['<S>']
        elif add_eos:
            return list(symbols) + ['<eos>']
        else:
            return symbols
class LMOrderedIterator(object):
    """Iterates over an ordered token stream laid out as ``bsz`` parallel columns."""

    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = 0 if ext_len is None else ext_len
        self.device = device

        # Number of whole rows once the stream is cut into bsz columns.
        self.n_step = data.size(0) // bsz

        # Drop the remainder that does not fill a complete row.
        trimmed = data.narrow(0, 0, self.n_step * bsz)

        # One contiguous chunk of the stream per column: [n_step x bsz].
        self.data = trimmed.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches (ceiling of n_step / bptt).
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        """Return ``(data, target, seq_len)`` for the window starting at row ``i``.

        ``target`` is shifted one row ahead; ``data`` additionally reaches
        back up to ``ext_len`` rows before ``i``.
        """
        window = self.bptt if bptt is None else bptt
        # The last row has no target, hence the "- 1".
        seq_len = min(window, self.data.size(0) - 1 - i)

        first = max(0, i - self.ext_len)
        last = i + seq_len

        inputs = self.data[first:last]
        target = self.data[i + 1:i + 1 + seq_len]

        return inputs, target, seq_len

    def get_fixlen_iter(self, start=0):
        """Yield consecutive batches whose windows are ``bptt`` rows apart."""
        stop = self.data.size(0) - 1
        for row in range(start, stop, self.bptt):
            yield self.get_batch(row)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        """Yield batches of randomly drawn length, clipped to ``[min_len, bptt + max_deviation * std]``."""
        max_len = self.bptt + max_deviation * std
        row = start
        while True:
            # Most of the time aim for bptt, occasionally for half of it.
            mean = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
            drawn = int(np.random.normal(mean, std))
            bptt = min(max_len, max(min_len, drawn))
            inputs, target, seq_len = self.get_batch(row, bptt)
            row += seq_len
            yield inputs, target, seq_len
            if row >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()
class LMShuffledIterator(object):
    """Packs independent sentences into fixed-size ``[bptt x bsz]`` batches."""

    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = 0 if ext_len is None else ext_len

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        """Yield the sentences once, in random order when ``shuffle`` is set."""
        # index iterator
        n_sents = len(self.data)
        if self.shuffle:
            order = np.random.permutation(n_sents)
        else:
            order = np.array(range(n_sents))

        # sentence iterator
        for idx in order:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        """Yield ``(data, target, bptt)`` until ``sent_stream`` cannot fill a batch.

        The same ``data`` / ``target`` tensors are refilled for every batch,
        so a consumer must copy what it wants to keep.
        """
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            try:
                for col in range(self.bsz):
                    n_filled = 0
                    while n_filled < self.bptt:
                        # A stream with a single token left has no target.
                        if streams[col] is None or len(streams[col]) <= 1:
                            streams[col] = next(sent_stream)
                        sent = streams[col]
                        # number of new tokens to fill in
                        n_new = min(len(sent) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
                        row = n_retain + n_filled
                        data[row:row + n_new, col] = sent[:n_new]
                        target[n_filled:n_filled + n_new, col] = sent[1:n_new + 1]
                        streams[col] = sent[n_new:]
                        n_filled += n_new
            except StopIteration:
                # Not enough sentences left for a complete batch.
                return

            data = data.to(self.device)
            target = target.to(self.device)

            yield data, target, self.bptt

            # Carry the last ext_len rows over to the top of the next batch.
            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        yield from self.stream_iterator(self.get_sent_stream())
class LMMultiFileIterator(LMShuffledIterator):
    """Batches sentences read from several files, one file at a time."""

    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
                 shuffle=False):
        """Keep the file ``paths`` and the ``vocab`` used to encode them.

        The parent constructor is not called; the attributes it would set
        (except ``data``) are assigned here.
        """
        self.paths = paths
        self.vocab = vocab

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = 0 if ext_len is None else ext_len

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        """Encode ``path`` and return an iterator over its sentences."""
        sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(sents)
        return iter(sents)

    def __iter__(self):
        if self.shuffle:
            # Shuffles the stored list of paths in place.
            np.random.shuffle(self.paths)

        for path in self.paths:
            # sent_stream is an iterator
            yield from self.stream_iterator(self.get_sent_stream(path))
class Corpus(object):
    """A dataset's vocabulary together with its encoded train/valid/test splits."""

    def __init__(self, path, dataset, *args, **kwargs):
        """Build the vocabulary for ``dataset`` and encode its splits.

        Args:
            path: Directory holding ``train.txt`` / ``valid.txt`` /
                ``test.txt`` (and the lm1b training shards).
            dataset: One of 'ptb', 'wt2', 'wt103', 'enwik8', 'text8', 'lm1b'.
            *args, **kwargs: Forwarded to ``TransfoXLTokenizer``.
        """
        self.dataset = dataset
        # The tokenizer class of this module (it plays the role of the
        # reference implementation's Vocab class).
        self.vocab = TransfoXLTokenizer(*args, **kwargs)

        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
            self.vocab.count_file(os.path.join(path, 'train.txt'))
            self.vocab.count_file(os.path.join(path, 'valid.txt'))
            self.vocab.count_file(os.path.join(path, 'test.txt'))
        elif self.dataset == 'wt103':
            self.vocab.count_file(os.path.join(path, 'train.txt'))
        elif self.dataset == 'lm1b':
            train_path_pattern = os.path.join(
                path, '1-billion-word-language-modeling-benchmark-r13output',
                'training-monolingual.tokenized.shuffled', 'news.en-*')
            train_paths = glob.glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called

        self.vocab.build_vocab()

        if self.dataset in ['ptb', 'wt2', 'wt103']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True)
        elif self.dataset in ['enwik8', 'text8']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
        elif self.dataset == 'lm1b':
            # Training shards are kept as paths; the iterator encodes them.
            self.train = train_paths
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)

    def get_iterator(self, split, *args, **kwargs):
        """Return the batch iterator for ``split`` ('train', 'valid' or 'test').

        ``*args`` / ``**kwargs`` are forwarded to the iterator class.

        Raises:
            ValueError: If no iterator exists for this split and dataset.
        """
        data_iter = None
        if split == 'train':
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
            elif self.dataset == 'lm1b':
                kwargs['shuffle'] = True
                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
        elif split in ['valid', 'test']:
            data = self.valid if split == 'valid' else self.test
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(data, *args, **kwargs)
            elif self.dataset == 'lm1b':
                data_iter = LMShuffledIterator(data, *args, **kwargs)

        if data_iter is None:
            raise ValueError("No iterator for split '{}' of dataset '{}'".format(
                split, self.dataset))
        return data_iter
def get_lm_corpus(datadir, dataset):
    """Return the corpus for ``dataset``, using a cache in ``datadir`` if present.

    ``cache.pt`` (written with ``torch.save``) is preferred, then
    ``cache.pkl`` (a plain pickle). Without a cache the corpus is built from
    the raw files and saved to ``cache.pt``.

    Args:
        datadir: Directory holding the dataset files and the cache.
        dataset: Dataset name; selects the tokenizer options.

    Returns:
        The loaded or newly built corpus object.
    """
    fn = os.path.join(datadir, 'cache.pt')
    fn_pickle = os.path.join(datadir, 'cache.pkl')
    # NOTE: both loaders unpickle the cache file, which can execute arbitrary
    # code -- only use cache files from a trusted source.
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    elif os.path.exists(fn_pickle):
        print('Loading cached dataset from pickle...')
        with open(fn_pickle, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print('Producing dataset {}...'.format(dataset))
        kwargs = {}
        if dataset in ['wt103', 'wt2']:
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = False
        elif dataset == 'ptb':
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = True
        elif dataset == 'lm1b':
            kwargs['special'] = []
            kwargs['lower_case'] = False
            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
        elif dataset in ['enwik8', 'text8']:
            pass

        corpus = Corpus(datadir, dataset, **kwargs)
        torch.save(corpus, fn)

    return corpus
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment