jerrrrry / mlperf_transformer_v0.7 / Commits / 9e8a8c05

Commit 9e8a8c05 authored Oct 14, 2024 by jerrrrry

Initial commit

Showing 9 changed files with 2104 additions and 0 deletions (+2104 -0)
implementations/pytorch/tests/test_sequence_scorer.py   +115 -0
implementations/pytorch/tests/test_train.py              +106 -0
implementations/pytorch/tests/test_utils.py                +85 -0
implementations/pytorch/tests/utils.py                    +169 -0
implementations/pytorch/train.py                          +867 -0
implementations/pytorch/transformer.log                   +110 -0
implementations/pytorch/utils/__pycache__/tokenizer.cpython-310.pyc   +0 -0
implementations/pytorch/utils/tokenizer.py                +652 -0
optimizers.zip                                              +0 -0
implementations/pytorch/tests/test_sequence_scorer.py (new file, 0 → 100644)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import argparse
import unittest

import torch

from fairseq.sequence_scorer import SequenceScorer

import tests.utils as test_utils


class TestSequenceScorer(unittest.TestCase):

    def test_sequence_scorer(self):
        # construct dummy dictionary
        d = test_utils.dummy_dictionary(vocab_size=2)
        self.assertEqual(d.pad(), 1)
        self.assertEqual(d.eos(), 2)
        self.assertEqual(d.unk(), 3)
        eos = d.eos()
        w1 = 4
        w2 = 5

        # construct dataloader
        data = [
            {
                'source': torch.LongTensor([w1, w2, eos]),
                'target': torch.LongTensor([w1, w2, w1, eos]),
            },
            {
                'source': torch.LongTensor([w2, eos]),
                'target': torch.LongTensor([w2, w1, eos]),
            },
            {
                'source': torch.LongTensor([w2, eos]),
                'target': torch.LongTensor([w2, eos]),
            },
        ]
        data_itr = test_utils.dummy_dataloader(data)

        # specify expected output probabilities
        args = argparse.Namespace()
        unk = 0.
        args.beam_probs = [
            # step 0:
            torch.FloatTensor([
                # eos      w1   w2
                [0.0, unk, 0.6, 0.4],  # sentence 1
                [0.0, unk, 0.4, 0.6],  # sentence 2
                [0.0, unk, 0.7, 0.3],  # sentence 3
            ]),
            # step 1:
            torch.FloatTensor([
                # eos      w1   w2
                [0.0, unk, 0.2, 0.7],  # sentence 1
                [0.0, unk, 0.8, 0.2],  # sentence 2
                [0.7, unk, 0.1, 0.2],  # sentence 3
            ]),
            # step 2:
            torch.FloatTensor([
                # eos       w1    w2
                [0.10, unk, 0.50, 0.4],  # sentence 1
                [0.15, unk, 0.15, 0.7],  # sentence 2
                [0.00, unk, 0.00, 0.0],  # sentence 3
            ]),
            # step 3:
            torch.FloatTensor([
                # eos      w1    w2
                [0.9, unk, 0.05, 0.05],  # sentence 1
                [0.0, unk, 0.00, 0.0],   # sentence 2
                [0.0, unk, 0.00, 0.0],   # sentence 3
            ]),
        ]
        expected_scores = [
            [0.6, 0.7, 0.5, 0.9],  # sentence 1
            [0.6, 0.8, 0.15],      # sentence 2
            [0.3, 0.7],            # sentence 3
        ]

        task = test_utils.TestTranslationTask.setup_task(args, d, d)
        model = task.build_model(args)
        scorer = SequenceScorer([model], task.target_dictionary)
        for id, _src, _ref, hypos in scorer.score_batched_itr(data_itr):
            self.assertHypoTokens(hypos[0], data[id]['target'])
            self.assertHypoScore(hypos[0], expected_scores[id])

    def assertHypoTokens(self, hypo, tokens):
        self.assertTensorEqual(hypo['tokens'], torch.LongTensor(tokens))

    def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.):
        pos_scores = torch.FloatTensor(pos_probs).log()
        self.assertAlmostEqual(hypo['positional_scores'], pos_scores)
        self.assertEqual(pos_scores.numel(), hypo['tokens'].numel())
        score = pos_scores.sum()
        if normalized:
            score /= pos_scores.numel() ** lenpen
        self.assertLess(abs(score - hypo['score']), 1e-6)

    def assertAlmostEqual(self, t1, t2):
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertLess((t1 - t2).abs().max(), 1e-4)

    def assertTensorEqual(self, t1, t2):
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertEqual(t1.ne(t2).long().sum(), 0)


if __name__ == '__main__':
    unittest.main()
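The scoring convention exercised above is that each hypothesis carries per-token log-probabilities and an overall score equal to the (optionally length-normalized) sum of those log-probabilities. A minimal sketch of that bookkeeping in plain PyTorch, independent of fairseq, using the expected values for sentence 1 from the test:

import torch

# Per-token probabilities for sentence 1: 0.6 for w1 at step 0, 0.7 for w2 at
# step 1, 0.5 for w1 at step 2, 0.9 for eos at step 3 (see expected_scores).
pos_probs = torch.FloatTensor([0.6, 0.7, 0.5, 0.9])
pos_scores = pos_probs.log()                 # per-token log-probabilities

# Unnormalized score is the sum of log-probs; with normalization (lenpen=1.0)
# it is divided by the number of tokens, matching assertHypoScore above.
unnormalized = pos_scores.sum()
normalized = unnormalized / pos_scores.numel() ** 1.0
print(float(unnormalized), float(normalized))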
implementations/pytorch/tests/test_train.py (new file, 0 → 100644)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import contextlib
from io import StringIO
import unittest
from unittest.mock import MagicMock, patch

import torch

from fairseq import data

import train


def mock_trainer(epoch, num_updates, iterations_in_epoch):
    trainer = MagicMock()
    trainer.load_checkpoint.return_value = {
        'train_iterator': {
            'epoch': epoch,
            'iterations_in_epoch': iterations_in_epoch,
            'shuffle': False,
        },
    }
    trainer.get_num_updates.return_value = num_updates
    return trainer


def mock_dict():
    d = MagicMock()
    d.pad.return_value = 1
    d.eos.return_value = 2
    d.unk.return_value = 3
    return d


def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size)))
    tokens_ds = data.TokenBlockDataset(tokens, [len(tokens)], 1, include_targets=False)
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    epoch_itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False),
        max_tokens=1,
    )
    return trainer, epoch_itr


class TestLoadCheckpoint(unittest.TestCase):

    def setUp(self):
        self.patches = {
            'os.makedirs': MagicMock(),
            'os.path.join': MagicMock(),
            'os.path.isfile': MagicMock(return_value=True),
        }
        self.applied_patches = [patch(p, d) for p, d in self.patches.items()]
        [p.start() for p in self.applied_patches]

    def test_load_partial_checkpoint(self):
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50)

            train.load_checkpoint(MagicMock(), trainer, epoch_itr)
            self.assertEqual(epoch_itr.epoch, 2)
            self.assertEqual(epoch_itr.iterations_in_epoch, 50)

            itr = epoch_itr.next_epoch_itr(shuffle=False)
            self.assertEqual(epoch_itr.epoch, 2)
            self.assertEqual(epoch_itr.iterations_in_epoch, 50)
            self.assertEqual(next(itr)['net_input']['src_tokens'][0].item(), 50)
            self.assertEqual(epoch_itr.iterations_in_epoch, 51)

    def test_load_full_checkpoint(self):
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 300, 150)

            train.load_checkpoint(MagicMock(), trainer, epoch_itr)
            itr = epoch_itr.next_epoch_itr(shuffle=False)

            self.assertEqual(epoch_itr.epoch, 3)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
            self.assertEqual(next(itr)['net_input']['src_tokens'][0].item(), 0)

    def test_load_no_checkpoint(self):
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(0, 150, 0, 0)
            self.patches['os.path.isfile'].return_value = False

            train.load_checkpoint(MagicMock(), trainer, epoch_itr)
            itr = epoch_itr.next_epoch_itr(shuffle=False)

            self.assertEqual(epoch_itr.epoch, 1)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
            self.assertEqual(next(itr)['net_input']['src_tokens'][0].item(), 0)

    def tearDown(self):
        patch.stopall()


if __name__ == '__main__':
    unittest.main()
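The checkpoint tests above hinge on one behaviour: after load_checkpoint, the epoch iterator must resume mid-epoch at iterations_in_epoch, and a fully consumed epoch must roll over to the start of the next one. A rough standalone sketch of that resumption logic (plain Python, a toy stand-in rather than the fairseq EpochBatchIterator):

class ResumableEpochIterator:
    """Toy stand-in: yields batch indices and can resume from a saved offset."""

    def __init__(self, epoch_size, epoch=1, iterations_in_epoch=0):
        self.epoch_size = epoch_size
        self.epoch = epoch
        self.iterations_in_epoch = iterations_in_epoch

    def load_state_dict(self, state):
        self.epoch = state['epoch']
        self.iterations_in_epoch = state['iterations_in_epoch']
        if self.iterations_in_epoch >= self.epoch_size:
            # a fully consumed epoch resumes at the start of the next one
            self.epoch += 1
            self.iterations_in_epoch = 0

    def next_epoch_itr(self):
        for i in range(self.iterations_in_epoch, self.epoch_size):
            self.iterations_in_epoch = i + 1
            yield i

itr = ResumableEpochIterator(epoch_size=150)
itr.load_state_dict({'epoch': 2, 'iterations_in_epoch': 50})
assert next(itr.next_epoch_itr()) == 50   # mirrors test_load_partial_checkpoint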
implementations/pytorch/tests/test_utils.py (new file, 0 → 100644)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import unittest

import torch

from fairseq import utils


class TestUtils(unittest.TestCase):

    def test_convert_padding_direction(self):
        pad = 1
        left_pad = torch.LongTensor([
            [2, 3, 4, 5, 6],
            [1, 7, 8, 9, 10],
            [1, 1, 1, 11, 12],
        ])
        right_pad = torch.LongTensor([
            [2, 3, 4, 5, 6],
            [7, 8, 9, 10, 1],
            [11, 12, 1, 1, 1],
        ])

        self.assertAlmostEqual(
            right_pad,
            utils.convert_padding_direction(
                left_pad,
                pad,
                left_to_right=True,
            ),
        )
        self.assertAlmostEqual(
            left_pad,
            utils.convert_padding_direction(
                right_pad,
                pad,
                right_to_left=True,
            ),
        )

    def test_make_positions(self):
        pad = 1
        left_pad_input = torch.LongTensor([
            [9, 9, 9, 9, 9],
            [1, 9, 9, 9, 9],
            [1, 1, 1, 9, 9],
        ])
        left_pad_output = torch.LongTensor([
            [2, 3, 4, 5, 6],
            [1, 2, 3, 4, 5],
            [1, 1, 1, 2, 3],
        ])
        right_pad_input = torch.LongTensor([
            [9, 9, 9, 9, 9],
            [9, 9, 9, 9, 1],
            [9, 9, 1, 1, 1],
        ])
        right_pad_output = torch.LongTensor([
            [2, 3, 4, 5, 6],
            [2, 3, 4, 5, 1],
            [2, 3, 1, 1, 1],
        ])

        self.assertAlmostEqual(
            left_pad_output,
            utils.make_positions(left_pad_input, pad, left_pad=True),
        )
        self.assertAlmostEqual(
            right_pad_output,
            utils.make_positions(right_pad_input, pad, left_pad=False),
        )

    def assertAlmostEqual(self, t1, t2):
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertLess(utils.item((t1 - t2).abs().max()), 1e-4)


if __name__ == '__main__':
    unittest.main()
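For reference, converting left padding to right padding (the behaviour test_convert_padding_direction checks) amounts to shifting each row's pad symbols to the other end while preserving token order. A small self-contained sketch using only plain PyTorch, not the fairseq implementation:

import torch

def move_padding_right(tokens, pad):
    """Shift pad symbols from the left of each row to the right (illustration only)."""
    out = torch.full_like(tokens, pad)
    for row_in, row_out in zip(tokens, out):
        values = row_in[row_in != pad]        # keep non-pad tokens in order
        row_out[:values.numel()] = values
    return out

left_pad = torch.LongTensor([
    [2, 3, 4, 5, 6],
    [1, 7, 8, 9, 10],
    [1, 1, 1, 11, 12],
])
print(move_padding_right(left_pad, pad=1))
# tensor([[ 2,  3,  4,  5,  6],
#         [ 7,  8,  9, 10,  1],
#         [11, 12,  1,  1,  1]])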
implementations/pytorch/tests/utils.py (new file, 0 → 100644)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import torch

from fairseq import utils
from fairseq.data import Dictionary
from fairseq.data.language_pair_dataset import collate
from fairseq.models import (
    FairseqEncoder,
    FairseqIncrementalDecoder,
    FairseqModel,
)
from fairseq.tasks import FairseqTask


def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d


def dummy_dataloader(
    samples,
    padding_idx=1,
    eos_idx=2,
    batch_size=None,
):
    if batch_size is None:
        batch_size = len(samples)

    # add any missing data to samples
    for i, sample in enumerate(samples):
        if 'id' not in sample:
            sample['id'] = i

    # create dataloader
    dataset = TestDataset(samples)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=(lambda samples: collate(samples, padding_idx, eos_idx)),
    )
    return iter(dataloader)


class TestDataset(torch.utils.data.Dataset):

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)


class TestTranslationTask(FairseqTask):

    def __init__(self, args, src_dict, tgt_dict, model):
        super().__init__(args)
        self.src_dict = src_dict
        self.tgt_dict = tgt_dict
        self.model = model

    @classmethod
    def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None):
        return cls(args, src_dict, tgt_dict, model)

    def build_model(self, args):
        return TestModel.build_model(args, self)

    @property
    def source_dictionary(self):
        return self.src_dict

    @property
    def target_dictionary(self):
        return self.tgt_dict


class TestModel(FairseqModel):

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @classmethod
    def build_model(cls, args, task):
        encoder = TestEncoder(args, task.source_dictionary)
        decoder = TestIncrementalDecoder(args, task.target_dictionary)
        return cls(encoder, decoder)


class TestEncoder(FairseqEncoder):

    def __init__(self, args, dictionary):
        super().__init__(dictionary)
        self.args = args

    def forward(self, src_tokens, src_lengths):
        return src_tokens

    def reorder_encoder_out(self, encoder_out, new_order):
        return encoder_out.index_select(0, new_order)


class TestIncrementalDecoder(FairseqIncrementalDecoder):

    def __init__(self, args, dictionary):
        super().__init__(dictionary)
        assert hasattr(args, 'beam_probs') or hasattr(args, 'probs')
        args.max_decoder_positions = getattr(args, 'max_decoder_positions', 100)
        self.args = args

    def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        bbsz = prev_output_tokens.size(0)
        vocab = len(self.dictionary)
        src_len = encoder_out.size(1)
        tgt_len = prev_output_tokens.size(1)

        # determine number of steps
        if incremental_state is not None:
            # cache step number
            step = utils.get_incremental_state(self, incremental_state, 'step')
            if step is None:
                step = 0
            utils.set_incremental_state(self, incremental_state, 'step', step + 1)
            steps = [step]
        else:
            steps = list(range(tgt_len))

        # define output in terms of raw probs
        if hasattr(self.args, 'probs'):
            assert self.args.probs.dim() == 3, \
                'expected probs to have size bsz*steps*vocab'
            probs = self.args.probs.index_select(1, torch.LongTensor(steps))
        else:
            probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_()
            for i, step in enumerate(steps):
                # args.beam_probs gives the probability for every vocab element,
                # starting with eos, then unknown, and then the rest of the vocab
                if step < len(self.args.beam_probs):
                    probs[:, i, self.dictionary.eos():] = self.args.beam_probs[step]
                else:
                    probs[:, i, self.dictionary.eos()] = 1.0

        # random attention
        attn = torch.rand(bbsz, tgt_len, src_len)

        return probs, attn

    def get_normalized_probs(self, net_output, log_probs, _):
        # the decoder returns probabilities directly
        probs = net_output[0]
        if log_probs:
            return probs.log()
        else:
            return probs

    def max_positions(self):
        return self.args.max_decoder_positions
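Putting the helpers together: a typical test builds a dummy dictionary, wraps samples in dummy_dataloader, and sets up TestTranslationTask with scripted beam_probs so the toy decoder emits known probabilities. A hedged usage sketch, mirroring how test_sequence_scorer.py above uses these helpers (assumes a working fairseq install):

import argparse
import torch
import tests.utils as test_utils

d = test_utils.dummy_dictionary(vocab_size=2)   # pad=1, eos=2, unk=3, real tokens start at 4
args = argparse.Namespace()
args.beam_probs = [
    # step 0 probabilities over [eos, unk, token_0, token_1]:
    # all mass on the first real token
    torch.FloatTensor([[0.0, 0.0, 1.0, 0.0]]),
]
task = test_utils.TestTranslationTask.setup_task(args, d, d)
model = task.build_model(args)                  # TestEncoder + TestIncrementalDecoder
data_itr = test_utils.dummy_dataloader([
    {'source': torch.LongTensor([4, d.eos()]),
     'target': torch.LongTensor([4, d.eos()])},
])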
implementations/pytorch/train.py (new file, 0 → 100644)
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import collections
import itertools
import os
import math
import torch
import time
import ctypes
import random
import sys
import unicodedata
import six
import re
import gc

from copy import deepcopy
from functools import reduce
from six.moves import xrange

import numpy as np

from fairseq import data, distributed_utils, options, progress_bar, tasks, utils, tokenizer
from fairseq.trainer import Trainer
from fairseq.meters import AverageMeter, StopwatchMeter, TimeMeter
from fairseq.sequence_generator import SequenceGenerator
from fairseq.data import dictionary
from fairseq.data import language_pair_dataset

from mlperf_log_utils import log_start, log_end, log_event, barrier
from mlperf_logging.mllog import constants
from mlperf_logging import mllog


def generate_seeds(rng, size):
    """
    Generate list of random seeds

    :param rng: random number generator
    :param size: length of the returned list
    """
    seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
    return seeds


def broadcast_seeds(seeds, device):
    """
    Broadcasts random seeds to all distributed workers.
    Returns list of random seeds (broadcasted from workers with rank 0).

    :param seeds: list of seeds (integers)
    :param device: torch.device
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        seeds_tensor = torch.LongTensor(seeds).to(device)
        torch.distributed.broadcast(seeds_tensor, 0)
        seeds = seeds_tensor.tolist()
    return seeds


def setup_seeds(master_seed, epochs, device, rank, world_size):
    """
    Generates seeds from one master_seed.
    Function returns (worker_seeds, shuffling_seeds); worker_seeds are later
    used to initialize per-worker random number generators (mostly for
    dropouts), shuffling_seeds are for RNGs responsible for reshuffling the
    dataset before each epoch.
    Seeds are generated on worker with rank 0 and broadcasted to all other
    workers.

    :param master_seed: master RNG seed used to initialize other generators
    :param epochs: number of epochs
    :param device: torch.device (used for distributed.broadcast)
    """
    if master_seed is None:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)
        if rank == 0:
            # master seed is reported only from rank=0 worker, it's to avoid
            # confusion, seeds from rank=0 are later broadcasted to other
            # workers
            print(f'Using random master seed: {master_seed}')
    else:
        # master seed was specified from command line
        print(f'Using master seed from command line: {master_seed}')

    # initialize seeding RNG
    seeding_rng = random.Random(master_seed)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(seeding_rng, world_size)

    # generate seeds for data shuffling, one seed for every epoch
    shuffling_seeds = generate_seeds(seeding_rng, epochs)

    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device)
    shuffling_seeds = broadcast_seeds(shuffling_seeds, device)
    return worker_seeds, shuffling_seeds


def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_start(key=constants.INIT_START, log_all_ranks=True)

    # preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1), group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    log_event(key=constants.GLOBAL_BATCH_SIZE, value=args.max_tokens * args.distributed_world_size)
    log_event(key=constants.OPT_NAME, value=args.optimizer)
    assert (len(args.lr) == 1)
    log_event(key=constants.OPT_BASE_LR, value=args.lr[0] if len(args.lr) == 1 else args.lr)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
    assert (args.max_source_positions == args.max_target_positions)
    log_event(key=constants.MAX_SEQUENCE_LENGTH, value=args.max_target_positions, metadata={'method': 'discard'})
    log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
    log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
    log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
    log_event(key=constants.SEED, value=args.seed)

    # L2 Sector Promotion
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    #result = ctypes.CDLL('/opt/dtk-24.04.1/cuda/targets/x86_64-linux/lib/libcudart.so').cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    #result = ctypes.CDLL('/opt/dtk-24.04.1/cuda/targets/x86_64-linux/lib/libcudart.so').cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))

    worker_seeds, shuffling_seeds = setup_seeds(
        args.seed,
        args.max_epoch + 1,
        torch.device('cuda'),
        args.distributed_rank,
        args.distributed_world_size,
    )
    worker_seed = worker_seeds[args.distributed_rank]
    print(f'Worker {args.distributed_rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        if args.distributed_weight_update != 0:
            from fairseq.fp16_trainer import DistributedFP16Trainer
            trainer = DistributedFP16Trainer(args, task, model, criterion,
                                             allreduce_communicators=allreduce_communicators)
        else:
            from fairseq.fp16_trainer import FP16Trainer
            trainer = FP16Trainer(args, task, model, criterion,
                                  allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args, task, model, criterion, allreduce_communicators=None)

    #if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #    args.remove_bpe='@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()

    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert (torch.distributed.is_initialized())
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    log_end(key=constants.INIT_STOP, sync=False)
    log_start(key=constants.RUN_START, sync=True)
    # second sync after RUN_START tag is printed.
    # this ensures no rank touches data until after RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])

    log_event(key=constants.TRAIN_SAMPLES, value=len(task.dataset(args.train_subset)), sync=False)
    log_event(key=constants.EVAL_SAMPLES, value=len(task.dataset(args.gen_subset)), sync=False)

    ctr = 0

    start = time.time()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=args.enable_dataloader_pin_memory,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seeds=shuffling_seeds,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        epoch=epoch_itr.epoch if ctr != 0 else 0,
        bucket_growth_factor=args.bucket_growth_factor,
        seq_len_multiple=args.seq_len_multiple,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    )
    print("got epoch iterator", time.time() - start)

    # Main training loop
    while lr >= args.min_lr and epoch_itr.epoch < max_epoch and \
            trainer.get_num_updates() < max_update and current_bleu < tgt_bleu:
        first_epoch = epoch_itr.epoch + 1
        log_start(key=constants.BLOCK_START,
                  metadata={'first_epoch_num': first_epoch, 'epoch_count': 1},
                  sync=False)
        log_start(key=constants.EPOCH_START, metadata={'epoch_num': first_epoch}, sync=False)

        gc.disable()

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        #exit(1)
        train(args, trainer, task, epoch_itr, shuffling_seeds)
        print("epoch time ", time.time() - start)

        start = time.time()
        log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': first_epoch}, sync=False)

        # Eval BLEU score
        if args.online_eval or (tgt_bleu is not math.inf):
            current_bleu = score(args, trainer, task, epoch_itr, args.gen_subset)
            log_event(key=constants.EVAL_ACCURACY,
                      value=float(current_bleu) / 100.0,
                      metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        #if epoch_itr.epoch % args.save_interval == 0:
        #    save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        log_end(key=constants.BLOCK_STOP, metadata={'first_epoch_num': first_epoch}, sync=False)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    log_end(key=constants.RUN_STOP, metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))


def train(args, trainer, task, epoch_itr, shuffling_seeds):
    """Train the model for one epoch."""

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr()
    progress = progress_bar.build_progress_bar(args, itr, epoch_itr.epoch, no_progress_bar='simple')

    # update parameters every N batches
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]

    if args.enable_parallel_backward_allred_opt and update_freq > 1:
        raise RuntimeError('--enable-parallel-backward-allred-opt is incompatible with --update-freq > 1')

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    first_valid = args.valid_subset.split(',')[0]
    max_update = args.max_update or math.inf
    num_batches = len(epoch_itr)

    if args.time_step:
        begin = time.time()
        end = time.time()
    count = 0
    #profile_count = 13
    profile_count = 10000000000

    for i, sample in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        if args.time_step:
            start_step = time.time()
        if i < num_batches - 1 and (i + 1) % update_freq > 0:
            # buffer updates according to --update-freq
            trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
            continue
        else:
            log_output = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))

        if args.time_step:
            end_step = time.time()
            #if count > 10 and sample['target'].size(0) > 248 :
            seqs = sample['target'].size(0)
            srclen = sample['net_input']['src_tokens'].size(1)
            tgtlen = sample['target'].size(1)
            srcbatch = srclen * seqs
            tgtbatch = tgtlen * seqs
            #print("ITER {}> Seqs: {} SrcLen: {} TgtLen: {} Src Batch: {} Tgt Batch {}".format( count, seqs, srclen, tgtlen, srcbatch, tgtbatch))
            print("ITER {}> Seqs: {} SrcLen: {} TgtLen: {} Total Time: {:.3} Step Time: {:.3} Load Time: {:.3}".format(
                count,
                sample['target'].size(0),
                sample['net_input']['src_tokens'].size(1),
                sample['target'].size(1),
                (end_step - begin) * 1000.0,
                (end_step - start_step) * 1000.0,
                (start_step - end) * 1000.0))
            count += 1
            begin = time.time()

        # log mid-epoch stats
        stats = get_training_stats(trainer)
        for k, v in log_output.items():
            if k in ['loss', 'nll_loss', 'sample_size']:
                continue  # these are already logged above
            if 'loss' in k:
                extra_meters[k].update(v, log_output['sample_size'])
            else:
                extra_meters[k].update(v)
            stats[k] = extra_meters[k].avg
        progress.log(stats)

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_meter('wps').reset()

        if args.profile is not None and i == args.profile:
            import sys
            sys.exit()

        num_updates = trainer.get_num_updates()
        if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, [first_valid], shuffling_seeds)
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if num_updates >= max_update:
            break
        if args.time_step:
            end = time.time()

    # log end-of-epoch stats
    stats = get_training_stats(trainer)
    for k, meter in extra_meters.items():
        stats[k] = meter.avg
    progress.print(stats)

    # reset training meters
    for k in ['train_loss', 'train_nll_loss', 'wps', 'ups', 'wpb', 'bsz', 'clip']:
        meter = trainer.get_meter(k)
        if meter is not None:
            meter.reset()


def get_training_stats(trainer):
    stats = collections.OrderedDict()
    stats['loss'] = '{:.3f}'.format(trainer.get_meter('train_loss').avg)
    if trainer.get_meter('train_nll_loss').count > 0:
        nll_loss = trainer.get_meter('train_nll_loss').avg
        stats['nll_loss'] = '{:.3f}'.format(nll_loss)
    else:
        nll_loss = trainer.get_meter('train_loss').avg
    stats['ppl'] = get_perplexity(nll_loss)
    stats['wps'] = round(trainer.get_meter('wps').avg)
    stats['ups'] = '{:.1f}'.format(trainer.get_meter('ups').avg)
    stats['wpb'] = round(trainer.get_meter('wpb').avg)
    stats['bsz'] = round(trainer.get_meter('bsz').avg)
    stats['num_updates'] = trainer.get_num_updates()
    stats['lr'] = trainer.get_lr()
    stats['gnorm'] = '{:.3f}'.format(trainer.get_meter('gnorm').avg)
    stats['clip'] = '{:.0%}'.format(trainer.get_meter('clip').avg)
    stats['oom'] = trainer.get_meter('oom').avg
    if trainer.get_meter('loss_scale') is not None:
        stats['loss_scale'] = '{:.3f}'.format(trainer.get_meter('loss_scale').avg)
    stats['wall'] = round(trainer.get_meter('wall').elapsed_time)
    return stats


def validate(args, trainer, task, epoch_itr, subsets, shuffling_seeds):
    """Evaluate the model on the validation set(s) and return the losses."""
    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        itr = data.EpochBatchIterator(
            dataset=task.dataset(subset),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=trainer.get_model().max_positions(),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=8,
            seeds=shuffling_seeds,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            bucket_growth_factor=args.bucket_growth_factor,
            seq_len_multiple=args.seq_len_multiple,
            batching_scheme=args.batching_scheme,
            batch_multiple_strategy=args.batch_multiple_strategy,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch,
            prefix='valid on \'{}\' subset'.format(subset),
            no_progress_bar='simple'
        )

        # reset validation loss meters
        for k in ['valid_loss', 'valid_nll_loss']:
            meter = trainer.get_meter(k)
            if meter is not None:
                meter.reset()

        extra_meters = collections.defaultdict(lambda: AverageMeter())
        for sample in progress:
            log_output = trainer.valid_step(sample)

            for k, v in log_output.items():
                if k in ['loss', 'nll_loss', 'sample_size']:
                    continue
                extra_meters[k].update(v)

        # log validation stats
        stats = get_valid_stats(trainer)
        for k, meter in extra_meters.items():
            stats[k] = meter.avg
        progress.print(stats)

        valid_losses.append(stats['valid_loss'])
    return valid_losses


def _get_ngrams_with_counter(segment, max_order):
    """Extracts all n-grams up to a given maximum order from an input segment.

    Args:
      segment: text segment from which n-grams will be extracted.
      max_order: maximum length in tokens of the n-grams returned by this
          method.

    Returns:
      The Counter containing all n-grams up to max_order in segment
      with a count of how many times each n-gram occurred.
    """
    ngram_counts = collections.Counter()
    for order in xrange(1, max_order + 1):
        for i in xrange(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i + order])
            ngram_counts[ngram] += 1
    return ngram_counts


class RefBleuStats:
    def __init__(self, matches_by_order, possible_matches_by_order, reference_length, translation_length):
        self.matches_by_order = matches_by_order
        self.possible_matches_by_order = possible_matches_by_order
        self.reference_length = reference_length
        self.translation_length = translation_length

    def __add__(self, other):
        return RefBleuStats(
            [a + b for a, b in zip(self.matches_by_order, other.matches_by_order)],
            [a + b for a, b in zip(self.possible_matches_by_order, other.possible_matches_by_order)],
            self.reference_length + other.reference_length,
            self.translation_length + other.translation_length)


def compute_bleu(reference_corpus, translation_corpus, args, max_order=4, use_bp=True):
    """Computes BLEU score of translated segments against one or more references.

    Args:
      reference_corpus: list of references for each translation. Each
          reference should be tokenized into a list of tokens.
      translation_corpus: list of translations to score. Each translation
          should be tokenized into a list of tokens.
      args: CLI arguments
      max_order: Maximum n-gram order to use when computing BLEU score.
      use_bp: boolean, whether to apply brevity penalty.

    Returns:
      BLEU score.
    """
    reference_length = 0
    translation_length = 0
    bp = 1.0
    geo_mean = 0

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    precisions = []

    for (references, translations) in zip(reference_corpus, translation_corpus):
        reference_length += len(references)
        translation_length += len(translations)
        ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
        translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)

        overlap = dict((ngram, min(count, translation_ngram_counts[ngram]))
                       for ngram, count in ref_ngram_counts.items())

        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for ngram in translation_ngram_counts:
            possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[ngram]

    precisions = [0] * max_order
    smooth = 1.0

    # do reductions of matches_by_order and possible_matches_by_order
    if args.distributed_world_size > 1:
        stats = RefBleuStats(matches_by_order, possible_matches_by_order, reference_length, translation_length)
        all_stats = distributed_utils.all_gather_list(stats)
        stats = reduce(lambda a, b: a + b, all_stats)
        matches_by_order = stats.matches_by_order
        possible_matches_by_order = stats.possible_matches_by_order
        reference_length = stats.reference_length
        translation_length = stats.translation_length

    for i in xrange(0, max_order):
        if possible_matches_by_order[i] > 0:
            precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
            if matches_by_order[i] > 0:
                precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
            else:
                smooth *= 2
                precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
        else:
            precisions[i] = 0.0

    if max(precisions) > 0:
        p_log_sum = sum(math.log(p) for p in precisions if p)
        geo_mean = math.exp(p_log_sum / max_order)

    if use_bp:
        if reference_length > 0:
            ratio = translation_length / reference_length
            bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
        else:
            bp = 1.0
    bleu = geo_mean * bp
    return np.float32(bleu) * 100.0


def detokenize_subtokenized_sentence(subtokenized_sentence):
    l1 = ' '.join(''.join(subtokenized_sentence.strip().split()).split('_'))
    l1 = l1.replace(' ,', ',')
    l1 = l1.replace(' .', '.')
    l1 = l1.replace(' !', '!')
    l1 = l1.replace(' ?', '?')
    l1 = l1.replace(' \' ', '\'')
    l1 = l1.replace(' - ', '-')
    l1 = l1.strip()
    return l1


class UnicodeRegex(object):
    """Ad-hoc hack to recognize all punctuation and symbols."""

    def __init__(self):
        punctuation = self.property_chars("P")
        self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])")
        self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])")
        self.symbol_re = re.compile("([" + self.property_chars("S") + "])")

    def property_chars(self, prefix):
        return "".join(six.unichr(x) for x in range(sys.maxunicode)
                       if unicodedata.category(six.unichr(x)).startswith(prefix))


uregex = UnicodeRegex()


def bleu_tokenize(string):
    r"""Tokenize a string following the official BLEU implementation.

    See https://github.com/moses-smt/mosesdecoder/'
    'blob/master/scripts/generic/mteval-v14.pl#L954-L983

    In our case, the input string is expected to be just one line
    and no HTML entities de-escaping is needed.
    So we just tokenize on punctuation and symbols,
    except when a punctuation is preceded and followed by a digit
    (e.g. a comma/dot as a thousand/decimal separator).

    Note that a number (e.g. a year) followed by a dot at the end of sentence
    is NOT tokenized,
    i.e. the dot stays with the number because `s/(\p{P})(\P{N})/ $1 $2/g`
    does not match this case (unless we add a space after each sentence).
    However, this error is already in the original mteval-v14.pl
    and we want to be consistent with it.

    Args:
      string: the input string

    Returns:
      a list of tokens
    """
    string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string)
    string = uregex.punct_nondigit_re.sub(r" \1 \2", string)
    string = uregex.symbol_re.sub(r" \1 ", string)
    return string.split()


def score(args, trainer, task, epoch_itr, subset):
    log_start(key=constants.EVAL_START, metadata={'epoch_num': epoch_itr.epoch}, sync=False)
    begin = time.time()

    if not subset in task.datasets.keys():
        task.load_dataset(subset)

    src_dict = deepcopy(task.source_dictionary)  # This is necessary, generation of translations
    tgt_dict = deepcopy(task.target_dictionary)  # alters target dictionary messing up with the rest of training

    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=min(2560, args.max_tokens),
        max_sentences=max(8, min((math.ceil(1024 / args.distributed_world_size) // 4) * 4, 128)),
        max_positions=(256, 256),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        seq_len_multiple=args.seq_len_multiple,
        # Use a large growth factor to get fewer buckets.
        # Fewer buckets yield faster eval since batches are filled from single bucket
        # and eval dataset is small.
        bucket_growth_factor=1.2,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )

    # Generate and compute BLEU
    ref_toks = []
    sys_toks = []
    num_sentences = 0
    has_target = True

    if args.log_translations:
        log = open(os.path.join(
            args.save_dir,
            'translations_epoch{}_{}'.format(epoch_itr.epoch, args.distributed_rank)), 'w+')

    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens, args.remove_bpe)

            if args.log_translations:
                log.write('S-{}\t{}\n'.format(sample_id, src_str))
                if has_target:
                    log.write('T-{}\t{}\n'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe)
                if args.log_translations:
                    log.write('H-{}\t{}\t{}\n'.format(sample_id, hypo['score'], hypo_str))
                    # log.write(str(hypo_tokens))
                    log.write('P-{}\t{}\n'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            hypo['positional_scores'].tolist(),
                        ))
                    ))

                # Score only the top hypothesis
                if has_target and i == 0:
                    src_str = detokenize_subtokenized_sentence(src_str)
                    target_str = detokenize_subtokenized_sentence(target_str)
                    hypo_str = detokenize_subtokenized_sentence(hypo_str)
                    sys_tok = bleu_tokenize((hypo_str.lower() if args.ignore_case else hypo_str))
                    ref_tok = bleu_tokenize((target_str.lower() if args.ignore_case else target_str))
                    sys_toks.append(sys_tok)
                    ref_toks.append(ref_tok)

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    bleu_score_reference = compute_bleu(ref_toks, sys_toks, args)
    bleu_score_reference_str = '{:.4f}'.format(bleu_score_reference)
    if args.log_translations:
        log.close()
    if gen_timer.sum != 0:
        print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
            num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: bleu_score={}'.format(subset, args.beam, bleu_score_reference_str))

    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))
    log_end(key=constants.EVAL_STOP, metadata={'epoch_num': epoch_itr.epoch}, sync=False)

    return bleu_score_reference


def get_valid_stats(trainer):
    stats = collections.OrderedDict()
    stats['valid_loss'] = trainer.get_meter('valid_loss').avg
    if trainer.get_meter('valid_nll_loss').count > 0:
        nll_loss = trainer.get_meter('valid_nll_loss').avg
        stats['valid_nll_loss'] = nll_loss
    else:
        nll_loss = trainer.get_meter('valid_loss').avg
    stats['valid_ppl'] = get_perplexity(nll_loss)
    stats['num_updates'] = trainer.get_num_updates()
    if hasattr(save_checkpoint, 'best'):
        stats['best'] = min(save_checkpoint.best, stats['valid_loss'])
    return stats


def get_perplexity(loss):
    try:
        return '{:.2f}'.format(math.pow(2, loss))
    except OverflowError:
        return float('inf')


def save_checkpoint(args, trainer, epoch_itr, val_loss):
    if args.no_save or not distributed_utils.is_master(args):
        return
    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds['checkpoint{}.pt'.format(epoch)] = (
        end_of_epoch and not args.no_epoch_checkpoints and
        epoch % args.save_interval == 0
    )
    checkpoint_conds['checkpoint_{}_{}.pt'.format(epoch, updates)] = (
        not end_of_epoch and args.save_interval_updates > 0 and
        updates % args.save_interval_updates == 0
    )
    checkpoint_conds['checkpoint_best.pt'] = (
        val_loss is not None and
        (not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best)
    )
    checkpoint_conds['checkpoint_last.pt'] = True  # keep this last so that it's a symlink

    prev_best = getattr(save_checkpoint, 'best', val_loss)
    if val_loss is not None:
        save_checkpoint.best = min(val_loss, prev_best)
    extra_state = {
        'best': save_checkpoint.best,
        'train_iterator': epoch_itr.state_dict(),
        'val_loss': val_loss,
    }

    checkpoints = [os.path.join(args.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond]
    if len(checkpoints) > 0:
        for cp in checkpoints:
            trainer.save_checkpoint(cp, extra_state)

    if not end_of_epoch and args.keep_interval_updates > 0:
        # remove old checkpoints; checkpoints are sorted in descending order
        checkpoints = utils.checkpoint_paths(args.save_dir, pattern=r'checkpoint_\d+_(\d+)\.pt')
        for old_chk in checkpoints[args.keep_interval_updates:]:
            os.remove(old_chk)


def load_checkpoint(args, trainer, epoch_itr):
    """Load a checkpoint and replay dataloader to match."""
    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if os.path.isfile(checkpoint_path):
        extra_state = trainer.load_checkpoint(checkpoint_path)
        if extra_state is not None:
            # replay train iterator to match checkpoint
            epoch_itr.load_state_dict(extra_state['train_iterator'])

            print('| loaded checkpoint {} (epoch {} @ {} updates)'.format(
                checkpoint_path, epoch_itr.epoch, trainer.get_num_updates()))

            trainer.lr_step(epoch_itr.epoch)
            trainer.lr_step_update(trainer.get_num_updates())
            if 'best' in extra_state:
                save_checkpoint.best = extra_state['best']


def load_dataset_splits(task, splits):
    for split in splits:
        if split == 'train':
            task.load_dataset(split, combine=True)
        else:
            for k in itertools.count():
                split_k = split + (str(k) if k > 0 else '')
                try:
                    task.load_dataset(split_k, combine=False)
                except FileNotFoundError as e:
                    if k > 0:
                        break
                    raise e


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_port > 0 or args.distributed_init_method is not None:
        from distributed_train import main as distributed_main

        distributed_main(args)
    elif args.distributed_world_size > 1:
        from multiprocessing_train import main as multiprocessing_main

        multiprocessing_main(args)
    else:
        main(args)
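The BLEU path in score() above is: detokenize hypothesis and reference, bleu_tokenize both, then compute_bleu over clipped n-gram match counts with a brevity penalty. A small self-contained walk-through of that arithmetic on a toy sentence pair (standard single-reference BLEU, not the distributed reduction path above):

import collections
import math

def ngrams(tokens, max_order=4):
    counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(len(tokens) - order + 1):
            counts[tuple(tokens[i:i + order])] += 1
    return counts

ref = 'the cat sat on the mat'.split()
hyp = 'the cat sat on mat'.split()

max_order = 4
matches = [0] * max_order
possible = [0] * max_order
ref_counts, hyp_counts = ngrams(ref), ngrams(hyp)
for ngram, count in hyp_counts.items():
    n = len(ngram) - 1
    possible[n] += count
    matches[n] += min(count, ref_counts[ngram])   # clipped n-gram matches

precisions = [m / p if p else 0.0 for m, p in zip(matches, possible)]
geo_mean = math.exp(sum(math.log(p) for p in precisions if p) / max_order)
ratio = len(hyp) / len(ref)
bp = math.exp(1 - 1 / ratio) if ratio < 1.0 else 1.0   # brevity penalty
print('BLEU = {:.2f}'.format(100.0 * geo_mean * bp))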
implementations/pytorch/transformer.log (new file, 0 → 100644)
:::MLLOG {"namespace": "", "time_ms": 1728444225641, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728444225642, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728444225642, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728444225643, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728444225643, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728445229773, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728445371286, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 4096, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728452615867, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 1024, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728452615870, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.999, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728452615870, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-08, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728452615870, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1234, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728452980089, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 1024, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728452980092, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.999, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728452980092, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-08, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728452980092, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1234, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728453530231, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 1024, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.999, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-08, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1234, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728889462232, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889462232, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889462243, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889462246, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463182, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463193, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463210, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463219, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 81920, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.0019, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 750, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 64, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728889463221, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.98, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728889463221, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-09, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728889463221, "event_type": "POINT_IN_TIME", "key": "seed", "value": 22078, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728889761577, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762201, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762201, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762208, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762264, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762284, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762285, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762290, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762290, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 81920, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728889762290, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.0019, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 750, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 64, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.98, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-09, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "seed", "value": 17315, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728889771351, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 212}}
:::MLLOG {"namespace": "", "time_ms": 1728889771352, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 214}}
:::MLLOG {"namespace": "", "time_ms": 1728889771904, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 4590101, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 222}}
:::MLLOG {"namespace": "", "time_ms": 1728889771904, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 3003, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 225}}
:::MLLOG {"namespace": "", "time_ms": 1728889773125, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 255, "first_epoch_num": 1, "epoch_count": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728889773126, "event_type": "INTERVAL_START", "key": "epoch_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 258, "epoch_num": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890366314, "event_type": "INTERVAL_END", "key": "epoch_stop", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 273, "epoch_num": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890366315, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 640, "epoch_num": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890627512, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890627674, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890627712, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890627725, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628433, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628454, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628462, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628464, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628464, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 81920, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728890628464, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.0019, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 750, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 64, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.98, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-09, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "seed", "value": 9431, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728890637403, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 212}}
:::MLLOG {"namespace": "", "time_ms": 1728890637404, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 214}}
:::MLLOG {"namespace": "", "time_ms": 1728890637971, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 4590101, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 222}}
:::MLLOG {"namespace": "", "time_ms": 1728890637971, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 3003, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 225}}
:::MLLOG {"namespace": "", "time_ms": 1728890639238, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 255, "first_epoch_num": 1, "epoch_count": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890639239, "event_type": "INTERVAL_START", "key": "epoch_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 258, "epoch_num": 1}}
implementations/pytorch/utils/__pycache__/tokenizer.cpython-310.pyc
0 → 100644
View file @ 9e8a8c05
File added
implementations/pytorch/utils/tokenizer.py
0 → 100644
View file @ 9e8a8c05
# Copyright 2018 MLBenchmark Group. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines Subtokenizer class to encode and decode strings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import re
import sys
import unicodedata

import numpy as np
import six
from six.moves import xrange  # pylint: disable=redefined-builtin

LUA = "<lua_index_compat>"
PAD = "<pad>_"
PAD_ID = 1
EOS = "<EOS>_"
EOS_ID = 2
UNK = "<bypass_unk>"

RESERVED_TOKENS = [LUA, PAD, EOS, UNK]

# Set of characters that will be used in the function _escape_token() (see func
# docstring for more details).
# This set is added to the alphabet list to ensure that all escaped tokens can
# be encoded.
_ESCAPE_CHARS = set(u"\\_u;0123456789")
# Regex for the function _unescape_token(), the inverse of _escape_token().
# This is used to find "\u", "\\", and "\###;" substrings in the token.
_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")

_UNDEFINED_UNICODE = u"\u3013"

# Set contains all letter and number characters.
_ALPHANUMERIC_CHAR_SET = set(
    six.unichr(i) for i in xrange(sys.maxunicode)
    if (unicodedata.category(six.unichr(i)).startswith("L") or
        unicodedata.category(six.unichr(i)).startswith("N")))

# min_count is the minimum number of times a subtoken must appear in the data
# before it is added to the vocabulary. The value is found using binary search
# to obtain the target vocabulary size.
_MIN_MIN_COUNT = 1     # min value to use when binary searching for min_count
_MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count


class Subtokenizer(object):
  """Encodes and decodes strings to/from integer IDs."""

  def __init__(self, vocab_file, reserved_tokens=None):
    """Initializes the Subtokenizer from an existing subtoken vocab file."""
    print("Initializing Subtokenizer from file %s." % vocab_file)

    if reserved_tokens is None:
      reserved_tokens = RESERVED_TOKENS
    elif reserved_tokens == 'assumed_in_file':
      reserved_tokens = []

    self.subtoken_list = _load_vocab_file(vocab_file, reserved_tokens)
    self.alphabet = _generate_alphabet_dict(self.subtoken_list, reserved_tokens)
    self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)

    self.max_subtoken_length = 0
    for subtoken in self.subtoken_list:
      self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))

    # Create cache to speed up subtokenization
    self._cache_size = 2 ** 20
    self._cache = [(None, None)] * self._cache_size

  @staticmethod
  def init_from_files(vocab_file, files, target_vocab_size, threshold,
                      min_count=None, file_byte_limit=1e6,
                      reserved_tokens=None):
    """Create subtoken vocabulary based on files, and save vocab to file.

    Args:
      vocab_file: String name of vocab file to store subtoken vocabulary.
      files: List of file paths that will be used to generate vocabulary.
      target_vocab_size: target vocabulary size to generate.
      threshold: int threshold of vocabulary size to accept.
      min_count: int minimum count to use for generating the vocabulary. The
        min count is the minimum number of times a subtoken should appear in
        the files before it is added to the vocabulary. If set to None, this
        value is found using binary search.
      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text
        that will be drawn from the files.
      reserved_tokens: List of string tokens that are guaranteed to be at the
        beginning of the subtoken vocabulary list.

    Returns:
      Subtokenizer object
    """
    if reserved_tokens is None:
      reserved_tokens = RESERVED_TOKENS

    if os.path.exists(vocab_file):
      print("Vocab file already exists (%s)" % vocab_file)
    else:
      print("Begin steps to create subtoken vocabulary...")
      token_counts = _count_tokens(files, file_byte_limit)
      alphabet = _generate_alphabet_dict(token_counts)
      subtoken_list = _generate_subtokens_with_target_vocab_size(
          token_counts, alphabet, target_vocab_size, threshold, min_count,
          reserved_tokens)
      print("Generated vocabulary with %d subtokens." % len(subtoken_list))
      _save_vocab_file(vocab_file, subtoken_list)
    return Subtokenizer(vocab_file)

  @staticmethod
  def init_from_existing_vocab_file(vocab_file, files, target_vocab_size,
                                    threshold, min_count=None,
                                    file_byte_limit=1e6, reserved_tokens=None):
    """Restore a Subtokenizer from an existing subtoken vocabulary file.

    Unlike init_from_files, this does not build a vocabulary; it only reports
    whether vocab_file exists and then loads it. The remaining arguments are
    accepted for signature compatibility and are unused.

    Args:
      vocab_file: String name of an existing subtoken vocabulary file.
      files, target_vocab_size, threshold, min_count, file_byte_limit,
        reserved_tokens: unused; kept for API compatibility with
        init_from_files.

    Returns:
      Subtokenizer object
    """
    if os.path.exists(vocab_file):
      print("Vocab file exists (%s)" % vocab_file)
    else:
      print("Vocab file does not exist (%s)" % vocab_file)
    return Subtokenizer(vocab_file, reserved_tokens='assumed_in_file')

  def encode(self, raw_string, add_eos=False):
    """Encodes a string into a list of int subtoken ids."""
    ret = []
    tokens = _split_string_to_tokens(_native_to_unicode(raw_string))
    for token in tokens:
      ret.extend(self._token_to_subtoken_ids(token))
    if add_eos:
      ret.append(EOS_ID)
    return ret

  def _token_to_subtoken_ids(self, token):
    """Encode a single token into a list of subtoken ids."""
    cache_location = hash(token) % self._cache_size
    cache_key, cache_value = self._cache[cache_location]
    if cache_key == token:
      return cache_value

    ret = _split_token_to_subtokens(
        _escape_token(token, self.alphabet), self.subtoken_to_id_dict,
        self.max_subtoken_length)
    ret = [self.subtoken_to_id_dict[subtoken_id] for subtoken_id in ret]

    self._cache[cache_location] = (token, ret)
    return ret

  def decode(self, subtokens):
    """Converts a list of int subtoken ids into a string."""
    if isinstance(subtokens, np.ndarray):
      # Note that list(subtokens) converts subtokens to a python list, but the
      # items remain as np.int32. This converts both the array and its items.
      subtokens = subtokens.tolist()

    if not subtokens:
      return ""

    assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
        "Subtokens argument passed into decode() must be a list of integers.")

    return _unicode_to_native(
        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))

  def _subtoken_ids_to_tokens(self, subtokens):
    """Convert list of int subtoken ids to a list of string tokens."""
    escaped_tokens = "".join([
        self.subtoken_list[s] for s in subtokens
        if s < len(self.subtoken_list)])
    escaped_tokens = escaped_tokens.split("_")

    # All tokens in the vocabulary list have been escaped (see _escape_token())
    # so each token must be unescaped when decoding.
    ret = []
    for token in escaped_tokens:
      if token:
        ret.append(_unescape_token(token))
    return ret


def _save_vocab_file(vocab_file, subtoken_list):
  """Save subtokens to file."""
  with open(vocab_file, mode='w', newline='\n') as f:
    for subtoken in subtoken_list:
      f.write("'%s'\n" % _unicode_to_native(subtoken))


def _load_vocab_file(vocab_file, reserved_tokens=None):
  """Load vocabulary while ensuring reserved tokens are at the top."""
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  subtoken_list = []
  with open(vocab_file, mode='r', newline='\n') as f:
    for line in f:
      subtoken = _native_to_unicode(line.strip())
      subtoken = subtoken[1:-1]  # Remove surrounding single-quotes
      if subtoken in reserved_tokens:
        continue
      subtoken_list.append(_native_to_unicode(subtoken))
  return reserved_tokens + subtoken_list


def _native_to_unicode(s):
  """Convert string to unicode (required in Python 2)."""
  if six.PY2:
    return s if isinstance(s, unicode) else s.decode("utf-8")
  else:
    return s


def _unicode_to_native(s):
  """Convert string from unicode to native format (required in Python 2)."""
  if six.PY2:
    return s.encode("utf-8") if isinstance(s, unicode) else s
  else:
    return s


def _split_string_to_tokens(text):
  """Splits text to a list of string tokens."""
  if not text:
    return []
  ret = []
  token_start = 0
  # Classify each character in the input string
  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
  for pos in xrange(1, len(text)):
    if is_alnum[pos] != is_alnum[pos - 1]:
      token = text[token_start:pos]
      if token != u" " or token_start == 0:
        ret.append(token)
      token_start = pos
  final_token = text[token_start:]
  ret.append(final_token)
  return ret


def _join_tokens_to_string(tokens):
  """Join a list of string tokens into a single string."""
  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
  ret = []
  for i, token in enumerate(tokens):
    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
      ret.append(u" ")
    ret.append(token)
  return "".join(ret)


def _escape_token(token, alphabet):
  r"""Replace characters that aren't in the alphabet and append "_" to token.

  Apply three transformations to the token:
    1. Replace underline character "_" with "\u", and backslash "\" with "\\".
    2. Replace characters outside of the alphabet with "\###;", where ### is
       the character's Unicode code point.
    3. Appends "_" to mark the end of a token.

  Args:
    token: unicode string to be escaped
    alphabet: list of all known characters

  Returns:
    escaped string
  """
  token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u")
  ret = [c if c in alphabet and c != u"\n" else r"\%d;" % ord(c)
         for c in token]
  return u"".join(ret) + "_"


def _unescape_token(token):
  r"""Replaces escaped characters in the token with their unescaped versions.

  Applies inverse transformations as _escape_token():
    1. Replace "\u" with "_", and "\\" with "\".
    2. Replace "\###;" with the unicode character the ### refers to.

  Args:
    token: escaped string

  Returns:
    unescaped string
  """

  def match(m):
    r"""Returns replacement string for matched object.

    Matched objects contain one of the strings that matches the regex pattern:
      r"\\u|\\\\|\\([0-9]+);"
    The strings can be '\u', '\\', or '\###;' (### is any digit number).

    m.group(0) refers to the entire matched string ('\u', '\\', or '\###;').
    m.group(1) refers to the first parenthesized subgroup ('###').

    m.group(0) exists for all match objects, while m.group(1) exists only for
    the string '\###;'.

    This function looks to see if m.group(1) exists. If it doesn't, then the
    matched string must be '\u' or '\\'. In this case, the corresponding
    replacements ('_' and '\') are returned. Note that in python, a single
    backslash is written as '\\', and double backslash as '\\\\'.

    If m.group(1) exists, then use the integer in m.group(1) to return a
    unicode character.

    Args:
      m: match object

    Returns:
      String to replace matched object with.
    """
    # Check if the matched strings are '\u' or '\\'.
    if m.group(1) is None:
      return u"_" if m.group(0) == u"\\u" else u"\\"

    # If m.group(1) exists, try and return unicode character.
    try:
      return six.unichr(int(m.group(1)))
    except (ValueError, OverflowError):
      return _UNDEFINED_UNICODE

  # Use match function to replace escaped substrings in the token.
  return _UNESCAPE_REGEX.sub(match, token)


def _count_tokens(files, file_byte_limit=1e6):
  """Return token counts of words in the files.

  Samples file_byte_limit bytes from each file, and counts the words that
  appear in the samples. The samples are semi-evenly distributed across the
  file.

  Args:
    files: List of filepaths
    file_byte_limit: Max number of bytes that will be read from each file.

  Returns:
    Dictionary mapping tokens to the number of times they appear in the
    sampled lines from the files.
  """
  token_counts = collections.defaultdict(int)

  for filepath in files:
    with open(filepath, mode='r', newline='\n') as reader:
      file_byte_budget = file_byte_limit
      counter = 0
      lines_to_skip = int(os.path.getsize(filepath) / (file_byte_budget * 2))
      for line in reader:
        if counter < lines_to_skip:
          counter += 1
        else:
          if file_byte_budget < 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          counter = 0

          # Add words to token counts
          for token in _split_string_to_tokens(_native_to_unicode(line)):
            token_counts[token] += 1
  return token_counts


def _list_to_index_dict(lst):
  """Create dictionary mapping list items to their indices in the list."""
  return {item: n for n, item in enumerate(lst)}


def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
  """Splits a token into subtokens defined in the subtoken dict."""
  ret = []
  start = 0
  token_len = len(token)
  while start < token_len:
    # Find the longest subtoken, so iterate backwards.
    for end in xrange(min(token_len, start + max_subtoken_length), start, -1):
      subtoken = token[start:end]
      if subtoken in subtoken_dict:
        ret.append(subtoken)
        start = end
        break
    else:  # Did not break
      # If there is no possible encoding of the escaped token then one of the
      # characters in the token is not in the alphabet. This should be
      # impossible and would be indicative of a bug.
      raise ValueError("Was unable to split token \"%s\" into subtokens." %
                       token)
  return ret


def _generate_subtokens_with_target_vocab_size(
    token_counts, alphabet, target_size, threshold, min_count=None,
    reserved_tokens=None):
  """Generate subtoken vocabulary close to the target size."""
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  if min_count is not None:
    print("Using min_count=%d to generate vocab with target size %d" %
          (min_count, target_size))
    return _generate_subtokens(
        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)

  def bisect(min_val, max_val):
    """Recursive function to binary search for subtoken vocabulary."""
    cur_count = (min_val + max_val) // 2
    print("Binary search: trying min_count=%d (%d %d)" %
          (cur_count, min_val, max_val))
    subtoken_list = _generate_subtokens(
        token_counts, alphabet, cur_count, reserved_tokens=reserved_tokens)

    val = len(subtoken_list)
    print("Binary search: min_count=%d resulted in %d tokens" %
          (cur_count, val))

    within_threshold = abs(val - target_size) < threshold
    if within_threshold or min_val >= max_val or cur_count < 2:
      return subtoken_list
    if val > target_size:
      other_subtoken_list = bisect(cur_count + 1, max_val)
    else:
      other_subtoken_list = bisect(min_val, cur_count - 1)

    # Return vocabulary dictionary with the closest number of tokens.
    other_val = len(other_subtoken_list)
    if abs(other_val - target_size) < abs(val - target_size):
      return other_subtoken_list
    return subtoken_list

  print("Finding best min_count to get target size of %d" % target_size)
  return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)


def _generate_alphabet_dict(iterable, reserved_tokens=None):
  """Create set of characters that appear in any element in the iterable."""
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS
  elif reserved_tokens == 'assumed_in_file':
    reserved_tokens = []
  alphabet = {c for token in iterable for c in token}
  alphabet |= {c for token in reserved_tokens for c in token}
  alphabet |= _ESCAPE_CHARS  # Add escape characters to alphabet set.
  return alphabet


def _count_and_gen_subtokens(
    token_counts, alphabet, subtoken_dict, max_subtoken_length):
  """Count number of times subtokens appear, and generate new subtokens.

  Args:
    token_counts: dict mapping tokens to the number of times they appear in
      the original files.
    alphabet: list of allowed characters. Used to escape the tokens, which
      guarantees that all tokens can be split into subtokens.
    subtoken_dict: dict mapping subtokens to ids.
    max_subtoken_length: maximum length of subtoken in subtoken_dict.

  Returns:
    A defaultdict mapping subtokens to the number of times they appear in the
    tokens. The dict may contain new subtokens.
  """
  subtoken_counts = collections.defaultdict(int)
  for token, count in six.iteritems(token_counts):
    token = _escape_token(token, alphabet)
    subtokens = _split_token_to_subtokens(
        token, subtoken_dict, max_subtoken_length)

    # Generate new subtokens by taking substrings from token.
    start = 0
    for subtoken in subtokens:
      for end in xrange(start + 1, len(token) + 1):
        new_subtoken = token[start:end]
        subtoken_counts[new_subtoken] += count
      start += len(subtoken)

  return subtoken_counts


def _filter_and_bucket_subtokens(subtoken_counts, min_count):
  """Return a bucketed list of subtokens that are filtered by count.

  Args:
    subtoken_counts: defaultdict mapping subtokens to their counts
    min_count: int count used to filter subtokens

  Returns:
    List of subtoken sets, where subtokens in set i have the same length=i.
  """
  # Create list of buckets, where subtokens in bucket i have length i.
  subtoken_buckets = []
  for subtoken, count in six.iteritems(subtoken_counts):
    if count < min_count:  # Filter out subtokens that don't appear enough
      continue
    while len(subtoken_buckets) <= len(subtoken):
      subtoken_buckets.append(set())
    subtoken_buckets[len(subtoken)].add(subtoken)
  return subtoken_buckets


def _gen_new_subtoken_list(subtoken_counts, min_count, alphabet,
                           reserved_tokens=None):
  """Generate candidate subtokens ordered by count, and new max subtoken length.

  Add subtokens to the candidate list in order of length (longest subtokens
  first). When a subtoken is added, the counts of each of its prefixes are
  decreased. Prefixes that don't appear much outside the subtoken are not
  added to the candidate list.

  For example:
    subtoken being added to candidate list: 'translate'
    subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
    min_count: 5

  When 'translate' is added, subtoken_counts is updated to:
    {'translate':0, 't':30, 'tr':6, 'tra': 2, ...}

  The subtoken 'tra' will not be added to the candidate list, because it
  appears twice (less than min_count) outside of 'translate'.

  Args:
    subtoken_counts: defaultdict mapping str subtokens to int counts
    min_count: int minimum count requirement for subtokens
    alphabet: set of characters. Each character is added to the subtoken list
      to guarantee that all tokens can be encoded.
    reserved_tokens: list of tokens that will be added to the beginning of the
      returned subtoken list.

  Returns:
    List of candidate subtokens in decreasing count order, and maximum
    subtoken length
  """
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  # Create a list of (count, subtoken) for each candidate subtoken.
  subtoken_candidates = []

  # Use bucketted list to iterate through subtokens in order of length.
  # subtoken_buckets[i] = set(subtokens), where each subtoken has length i.
  subtoken_buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
  max_subtoken_length = len(subtoken_buckets) - 1

  # Go through the list in reverse order to consider longer subtokens first.
  for subtoken_len in xrange(max_subtoken_length, 0, -1):
    for subtoken in subtoken_buckets[subtoken_len]:
      count = subtoken_counts[subtoken]

      # Possible if this subtoken is a prefix of another token.
      if count < min_count:
        continue

      # Ignore alphabet/reserved tokens, which will be added manually later.
      if subtoken not in alphabet and subtoken not in reserved_tokens:
        subtoken_candidates.append((count, subtoken))

      # Decrement count of the subtoken's prefixes (if a longer subtoken is
      # added, its prefixes lose priority to be added).
      for end in xrange(1, subtoken_len):
        subtoken_counts[subtoken[:end]] -= count

  # Add alphabet subtokens (guarantees that all strings are encodable).
  subtoken_candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)

  # Order subtoken candidates by decreasing count.
  subtoken_list = [t for _, t in sorted(subtoken_candidates, reverse=True)]

  # Add reserved tokens to beginning of the list.
  subtoken_list = reserved_tokens + subtoken_list
  return subtoken_list, max_subtoken_length


def _generate_subtokens(token_counts, alphabet, min_count, num_iterations=4,
                        reserved_tokens=None):
  """Create a list of subtokens in decreasing order of frequency.

  Args:
    token_counts: dict mapping str tokens -> int count
    alphabet: set of characters
    min_count: int minimum number of times a subtoken must appear before it
      is added to the vocabulary.
    num_iterations: int number of iterations to generate new tokens.
    reserved_tokens: list of tokens that will be added to the beginning of
      the returned subtoken list.

  Returns:
    Sorted list of subtokens (most frequent first)
  """
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  # Use alphabet set to create initial list of subtokens
  subtoken_list = reserved_tokens + list(alphabet)
  max_subtoken_length = 1

  # On each iteration, segment all words using the subtokens defined in
  # subtoken_dict, count how often the resulting subtokens appear, and update
  # the dictionary with subtokens w/ high enough counts.
  for i in xrange(num_iterations):
    print("\tGenerating subtokens: iteration %d" % i)
    # Generate new subtoken->id dictionary using the new subtoken list.
    subtoken_dict = _list_to_index_dict(subtoken_list)

    # Create dict mapping subtoken->count, with additional subtokens created
    # from substrings taken from the tokens.
    subtoken_counts = _count_and_gen_subtokens(
        token_counts, alphabet, subtoken_dict, max_subtoken_length)

    # Generate new list of subtokens sorted by subtoken count.
    subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
        subtoken_counts, min_count, alphabet, reserved_tokens)

    print("\tVocab size: %d" % len(subtoken_list))
  return subtoken_list
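For orientation, a minimal round-trip sketch of the Subtokenizer API defined in tokenizer.py above; the vocabulary path vocab.ende.32768 and the import path are assumptions for illustration, not part of this commit:
# Illustrative only: assumes a pre-built subtoken vocab file and that this is
# run from implementations/pytorch so that utils.tokenizer is importable.
from utils.tokenizer import Subtokenizer, EOS_ID

subtokenizer = Subtokenizer("vocab.ende.32768")  # vocab path is assumed

ids = subtokenizer.encode("hello world", add_eos=True)  # -> list of int ids
assert ids[-1] == EOS_ID

# decode() expects plain int ids; drop the trailing EOS before decoding.
text = subtokenizer.decode(ids[:-1])
print(text)  # -> "hello world", provided the vocabulary covers the input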
optimizers.zip
0 → 100644
View file @ 9e8a8c05
File added