OpenDAS / Megatron-LM · commit 8fae1cdd

Commit 8fae1cdd ("refactored evaluate_gpt2"), authored Apr 01, 2020 by Mohammad.
Parent: bf3ce751

Showing 8 changed files with 400 additions and 619 deletions (+400, -619).
evaluate_gpt2.py                      +0    -575
megatron/arguments.py                 +0    -28
megatron/utils.py                     +6    -1
pretrain_gpt2.py                      +4    -8
tasks/main.py                         +4    -0
tasks/zeroshot_gpt2/datasets.py       +161  -0
tasks/zeroshot_gpt2/detokenizer.py    +30   -7
tasks/zeroshot_gpt2/evaluate.py       +195  -0
evaluate_gpt2.py  ·  deleted (100755 → 0)  ·  view file @ bf3ce751
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain BERT"""
import
os
import
json
import
math
import
random
import
numpy
as
np
import
torch
from
arguments
import
get_args
from
configure_data
import
configure_data
from
megatron.fp16
import
FP16_Module
from
megatron.fp16
import
FP16_Optimizer
from
megatron.learning_rates
import
AnnealingLR
from
megatron.model
import
GPT2Model
from
megatron.model
import
DistributedDataParallel
as
DDP
from
megatron
import
mpu
from
apex.optimizers
import
FusedAdam
as
Adam
from
megatron.utils
import
Timers
from
megatron.utils
import
load_checkpoint
from
megatron.utils
import
report_memory
from
megatron.utils
import
print_params_min_max_norm
from
megatron
import
print_rank_0
from
megatron.data_utils
import
make_tokenizer
from
detokenizer
import
*
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=not args.cloze_eval)

    print_rank_0(' > number of parameters: {}'.format(
        sum([p.nelement() for p in model.parameters()])))

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model


def setup_model(args):
    """Setup model and optimizer."""

    model = get_model(args)

    if args.load is not None:
        _ = load_checkpoint(model, None, None, args)

    return model
def get_masks_and_position_ids(data,
                               eod_token,
                               reset_position_ids,
                               reset_attention_mask):

    # Extract batch size and sequence length.
    batch_size, seq_length = data.size()

    # Attention mask (lower triangular).
    if reset_attention_mask:
        att_mask_batch = batch_size
    else:
        att_mask_batch = 1
    attention_mask = torch.tril(torch.ones(
        (att_mask_batch, seq_length, seq_length), device=data.device)).view(
            att_mask_batch, 1, seq_length, seq_length)

    # Loss mask.
    loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
    loss_mask[data == eod_token] = 0.0

    # Position ids.
    position_ids = torch.arange(seq_length, dtype=torch.long,
                                device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    # We need to clone as the ids will be modified based on batch index.
    if reset_position_ids:
        position_ids = position_ids.clone()

    if reset_position_ids or reset_attention_mask:
        # Loop through the batches:
        for b in range(batch_size):

            # Find indices where EOD token is.
            eod_index = position_ids[b, data[b] == eod_token]
            # Detach indices from positions if going to modify positions.
            if reset_position_ids:
                eod_index = eod_index.clone()

            # Loop through EOD indices:
            prev_index = 0
            for j in range(eod_index.size()[0]):
                i = eod_index[j]
                # Mask attention loss.
                if reset_attention_mask:
                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
                # Reset positions.
                if reset_position_ids:
                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                    prev_index = i + 1

    return attention_mask, loss_mask, position_ids
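
A quick illustration (not part of the original file), assuming get_masks_and_position_ids as reconstructed above and using token id 0 as the EOD token:

    import torch

    data = torch.tensor([[5, 3, 0, 7, 2]])
    attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
        data, eod_token=0, reset_position_ids=True, reset_attention_mask=True)
    # Positions restart after the EOD token: tensor([[0, 1, 2, 0, 1]])
    print(position_ids)
    # No loss is taken on the EOD position itself: tensor([[1., 1., 0., 1., 1.]])
    print(loss_mask)
    # Causal mask whose rows 3-4 are zeroed in columns 0-2, so tokens after the
    # EOD cannot attend across it.
    print(attention_mask[0, 0])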
def get_batch(data_iterator, args, timers):
    ''' get_batch subdivides the source data into chunks of
    length args.seq_length. If source is equal to the example
    output of the data loading example, with a seq_length limit
    of 2, we'd get the following two Variables for i = 0:
    ┌ a g m s ┐ ┌ b h n t ┐
    └ b h n t ┘ └ c i o u ┘
    Note that despite the name of the function, the subdivision of data is not
    done along the batch dimension (i.e. dimension 1), since that was handled
    by the data loader. The chunks are along dimension 0, corresponding
    to the seq_len dimension in the LSTM. A Variable representing an appropriate
    shard reset mask of the same dimensions is also returned.
    '''
    # Items and their type.
    keys = ['text', 'pad_mask']
    datatype = torch.int64

    # Broadcast data.
    timers('data loader').start()
    if data_iterator is not None:
        data = next(data_iterator)
    else:
        data = None
    timers('data loader').stop()
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens_ = data_b['text'].long()
    lm_labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()
    padding_mask = data_b['pad_mask'].byte()

    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
        tokens,
        args.eod_token,
        args.reset_position_ids,
        args.reset_attention_mask)

    # Convert
    if args.fp16:
        attention_mask = attention_mask.half()

    return tokens, lm_labels, attention_mask, position_ids, padding_mask
def forward_step(data_iterator, model, args, timers):
    """Forward step."""

    # Get the batch.
    timers('batch generator').start()
    batch = get_batch(data_iterator, args, timers)
    if batch is None:
        return None
    tokens, lm_labels, attention_mask, position_ids, loss_mask = batch
    timers('batch generator').stop()

    # Forward model.
    if args.eval_hf:
        output, _ = model(tokens)
    else:
        output = model(tokens, position_ids, attention_mask)

    if not args.cloze_eval:
        # losses = torch.nn.CrossEntropyLoss(reduce=False)(
        losses = mpu.vocab_parallel_cross_entropy(
            output.contiguous().float(), lm_labels.contiguous())
        loss_mask = loss_mask.contiguous()
        loss_mask = loss_mask.view(-1)
        lm_loss = torch.sum(losses.view(-1) * loss_mask.float())
    else:
        outputs = torch.argmax(output, -1)
        correct = (outputs == lm_labels).float()
        correct[(1 - loss_mask).bool()] = 1
        correct = correct.prod(-1)
        lm_loss = correct.sum()
        # loss_mask = loss_mask.contiguous().view(-1).float()
        # lm_loss = torch.sum(acc * loss_mask)

    return lm_loss
def evaluate(data_loader, model, args, timers, num_iterations=None):
    """Evaluation."""

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_lm_loss = 0

    if num_iterations is not None:
        max_iters = num_iterations
    else:
        if mpu.get_model_parallel_rank() == 0:
            max_iters_gpu = torch.cuda.LongTensor([len(data_loader)])
        else:
            max_iters_gpu = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(max_iters_gpu,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        max_iters = max_iters_gpu[0].item()
        print_rank_0('global rank: {} | max iters: {}'.format(
            torch.distributed.get_rank(), max_iters))

    if data_loader is not None:
        data_iterator = iter(data_loader)
    else:
        data_iterator = None

    with torch.no_grad():
        iteration = 0
        while iteration < max_iters:
            if iteration % args.log_interval == 0:
                print_rank_0('global rank: {} | iteration: {}'.format(
                    torch.distributed.get_rank(), iteration))
            # Forward evaluation.
            lm_loss = forward_step(data_iterator, model, args, timers)
            if lm_loss is None:
                break
            # Reduce across processes.
            if isinstance(model, DDP):
                torch.distributed.all_reduce(lm_loss.data)
                if args.cloze_eval:
                    lm_loss.data = lm_loss.data / args.world_size
                else:
                    lm_loss.data = lm_loss.data / args.model_parallel_size

            if not args.cloze_eval:
                total_lm_loss += lm_loss.data.detach().float().item() / (
                    args.num_tokenized_tokens - 1)
            else:
                total_lm_loss += lm_loss.data.detach().float().item()

            iteration += 1

    # Move model back to the train mode.
    model.train()

    return total_lm_loss
def evaluate_and_print_results(prefix, data_iterator, model,
                               args, timers, num_iterations=None):
    """Helper function to evaluate and dump results on screen."""

    if not args.cloze_eval:
        lm_loss = evaluate(data_iterator, model, args, timers, num_iterations)
        val_loss = lm_loss
        ppl = math.exp(min(20, val_loss))
        token_ratio = (args.num_tokenized_tokens - 1) / (
            args.num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
        print_rank_0('-' * 100)
        string = ' validation results on {} | '.format(prefix)
        string += 'avg loss: {:.4E} | '.format(val_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)
        length = len(string) + 1
        print_rank_0('-' * length)
        print_rank_0(string)
        print_rank_0('-' * length)
        return val_loss
    else:
        num_correct = evaluate(data_iterator, model, args, timers,
                               num_iterations)
        acc = num_correct / args.num_examples
        print_rank_0('-' * 100)
        string = ' validation results on {} | '.format(prefix)
        string += 'number correct: {:.4E} | '.format(num_correct)
        string += 'total examples: {:.4E} | '.format(args.num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)
        length = len(string) + 1
        print_rank_0('-' * length)
        print_rank_0(string)
        print_rank_0('-' * length)
        return acc
def initialize_distributed(args):
    """Initialize torch.distributed."""

    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)
    # Call the init process
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=args.distributed_backend,
        world_size=args.world_size,
        rank=args.rank,
        init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)


def set_random_seed(seed):
    """Set random seed for reproducibility."""
    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)
class LM_Eval_Dataset(torch.utils.data.Dataset):

    def __init__(self, tokens, seq_len, pad_idx, overalapping_eval=None,
                 **kwargs):
        self.tokens = tokens
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.overalapping_eval = overalapping_eval
        if self.overalapping_eval is None:
            self.overalapping_eval = self.seq_len
        self.overalapping_eval = max(1, self.overalapping_eval)
        self.total_targets = len(self.tokens) - 1
        # remove first sequence tokens
        targets = max(self.total_targets - self.overalapping_eval, 0)
        self.total_sequences = max(
            math.ceil(targets / self.overalapping_eval) + 1, 1)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        start_idx = idx * self.overalapping_eval
        end_idx = start_idx + self.seq_len
        tokens = self.tokens[start_idx:end_idx + 1]
        num_tokens = len(tokens)
        pad_mask = [1] * num_tokens
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])
        if self.overalapping_eval != self.seq_len and idx != 0:
            pad_mask[:-self.overalapping_eval] *= 0

        return {'text': np.array(tokens), 'pad_mask': pad_mask}
class Lambada_Eval_Dataset(torch.utils.data.Dataset):

    def __init__(self, path, tokenizer, seq_len, strict=False, **kwargs):
        self.seq_len = seq_len
        self.pad_idx = tokenizer.get_command('pad').Id
        self.tokenizer = tokenizer
        self.strict = strict

        self.tokens = []
        self.labels = []
        with open(path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = self.get_tokens(text)
                self.tokens.append(tokens)
                self.labels.append(labels)

    def get_tokens(self, text):
        if not self.strict:
            tokens = self.tokenizer.EncodeAsIds(text).tokenization
            return tokens[:-1], [tokens[-1]]
        last_token = text.split()[-1]
        start_idx = text.rfind(last_token)
        beginning_tokens = self.tokenizer.EncodeAsIds(
            text[:start_idx].strip()).tokenization
        last_token = self.tokenizer.EncodeAsIds(' ' + last_token).tokenization
        return beginning_tokens, last_token

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        tokens = self.tokens[idx]
        num_tokens = len(tokens)
        pad_mask = [0] * num_tokens
        labels = self.labels[idx]
        pad_mask += [1] * len(labels)
        tokens = tokens + labels
        num_tokens = len(tokens)
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])

        return {'text': np.array(tokens), 'pad_mask': pad_mask}


def get_tokenizer(args):
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir}
    return make_tokenizer(**tokenizer_args)
def get_eval_data(args):
    val_dataloader = None
    if mpu.get_model_parallel_rank() == 0:
        eval_batch_size = args.eval_batch_size
        eval_batch_size = args.batch_size if eval_batch_size is None \
            else eval_batch_size
        seq_len = args.seq_length
        valid_data = args.valid_data
        valid_data = valid_data[0] if isinstance(valid_data, list) \
            else valid_data

        tokenizer = get_tokenizer(args)

        if not args.cloze_eval:
            with open(valid_data, "rb") as reader:
                entire_data = reader.read().decode('utf-8')
            num_original_tokens = len(entire_data.strip().split(" "))
            entire_data = get_detokenizer(valid_data)(entire_data)
            tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization
            num_tokenized_tokens = len(tokenized_data)
            string = 'Original Tokens: %d, Detokenized tokens: %d' % (
                num_tokenized_tokens, num_original_tokens)
            print_rank_0(string)

            eod_token = tokenizer.get_command('pad').Id
            val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, eod_token,
                                          args.overlapping_eval)
        else:
            val_dataset = Lambada_Eval_Dataset(valid_data, tokenizer, seq_len,
                                               args.strict_lambada)
            num_tokenized_tokens = 0
            num_original_tokens = 0

        val_dataloader = torch.utils.data.DataLoader(
            val_dataset, batch_size=eval_batch_size, drop_last=False)

        before = tokenizer.num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by * \
            mpu.get_model_parallel_world_size()
        while (after % multiple) != 0:
            after += 1
        print_rank_0(
            '> padded vocab (size: {}) with {} dummy tokens (new size: {})'.format(
                before, after - before, after))

        eod_token = tokenizer.get_command('pad').Id
        num_examples = len(val_dataset)
        token_counts = torch.cuda.LongTensor([after, eod_token, num_examples,
                                              num_original_tokens,
                                              num_tokenized_tokens])
    else:
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    args.vocab_size = token_counts[0].item()
    args.eod_token = token_counts[1].item()
    args.num_examples = token_counts[2].item()
    args.num_original_tokens = token_counts[3].item()
    args.num_tokenized_tokens = token_counts[4].item()
    print('global rank: {} | vocab size: {} | eod token: {} | '
          'num_examples: {} | num_original_tokens: {} | '
          'num_tokenized_tokens: {}'.format(
              torch.distributed.get_rank(), args.vocab_size, args.eod_token,
              args.num_examples, args.num_original_tokens,
              args.num_tokenized_tokens))

    return val_dataloader
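
The padding loop above rounds the tokenizer size up to the next multiple of make-vocab-size-divisible-by times the model-parallel world size. A rough sketch with hypothetical numbers (GPT-2's 50257-entry vocabulary, a divisor of 128, 2-way model parallelism):

    before = 50257          # hypothetical tokenizer.num_tokens
    multiple = 128 * 2      # assumed divisor and model-parallel world size
    after = before
    while (after % multiple) != 0:
        after += 1
    print(before, after - before, after)   # 50257 175 50432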
def main():
    """Main evaluation program."""

    print('Evaluate GPT2 model')

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    eval_data = get_eval_data(args)

    # Model, optimizer, and learning rate.
    if args.eval_hf:
        from pytorch_pretrained_bert import GPT2LMHeadModel
        from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
        if args.num_layers == 24:
            model_path = args.load
            # model_path = '/home/universal-lm-data.cosmos549/repos/gpt2_mp/models/345M'
            hfmodel = HFGPT2Model.from_pretrained(model_path,
                                                  cache_dir='gpt2_weights',
                                                  from_tf=True).cuda()
            model = GPT2LMHeadModel(hfmodel.config)
            model.transformer.load_state_dict(hfmodel.state_dict())
            model.cuda()
        else:
            model = GPT2LMHeadModel.from_pretrained(
                'gpt2', cache_dir='gpt2_weights').cuda()
    else:
        if args.load_openai:
            from megatron.utils import move_weights
            model_path = args.load
            args.load = None
            model = setup_model(args)
            from pytorch_pretrained_bert import GPT2LMHeadModel
            from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
            model_path = 'gpt2'
            from_tf = False
            print('loading openai weights')
            model.cpu()
            if args.num_layers == 24:
                # model_path = '/home/universal-lm-data.cosmos549/repos/gpt2_mp/models/345M'
                hfmodel = HFGPT2Model.from_pretrained(model_path,
                                                      cache_dir='gpt2_weights',
                                                      from_tf=True)
                gpt2model = GPT2LMHeadModel(hfmodel.config)
                gpt2model.transformer.load_state_dict(hfmodel.state_dict())
                gpt2model
            else:
                gpt2model = GPT2LMHeadModel.from_pretrained(
                    'gpt2', cache_dir='gpt2_weights')
            model2fill = model
            while isinstance(model2fill, (DDP, FP16_Module)):
                model2fill = model2fill.module
            move_weights(model2fill, gpt2model)
            model.cuda()
        else:
            model = setup_model(args)

    # Run on test data.
    prefix = "wiki"  # os.path.basename(args.valid_data)
    evaluate_and_print_results(prefix, eval_data, model, args, timers)


if __name__ == "__main__":
    main()
megatron/arguments.py  ·  view file @ 8fae1cdd

@@ -355,35 +355,7 @@ def _add_gpt2_args(parser):
     return parser
 
 
-def add_evaluation_args(parser):
-    """Evaluation arguments."""
-
-    group = parser.add_argument_group('validation', 'validation configurations')
-    group.add_argument('--eval-batch-size', type=int, default=None,
-                       help='Data Loader batch size for evaluation datasets.'
-                            'Defaults to `--batch-size`')
-    group.add_argument('--eval-seq-length', type=int, default=None,
-                       help='Maximum sequence length to process for '
-                            'evaluation. Defaults to `--seq-length`')
-    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
-                       help='Maximum number of predictions to use for '
-                            'evaluation. Defaults to '
-                            'math.ceil(`--eval-seq-length`*.15/10)*10')
-    group.add_argument('--overlapping-eval', type=int, default=32,
-                       help='sliding window for overlapping eval ')
-    group.add_argument('--cloze-eval', action='store_true',
-                       help='Evaluation dataset from `--valid-data` is a cloze task')
-    group.add_argument('--strict-lambada', action='store_true',
-                       help='use more difficult formulation of lambada')
-    group.add_argument('--eval-hf', action='store_true',
-                       help='perform evaluation with huggingface openai model.'
-                            'use `--load` to specify weights path to be loaded')
-    group.add_argument('--load-openai', action='store_true',
-                       help='load openai weights into our model. Use `--load` '
-                            'to specify weights path to be loaded')
-
-    return parser
-
-
 def add_text_generate_args(parser):
     """Text generate arguments."""
megatron/utils.py  ·  view file @ 8fae1cdd

@@ -119,7 +119,8 @@ def get_ltor_masks_and_position_ids(data,
                                     eod_token,
                                     reset_position_ids,
                                     reset_attention_mask,
-                                    eod_mask_loss):
+                                    eod_mask_loss,
+                                    fp16):
     """Build masks and position id for left to right model."""
 
     # Extract batch size and sequence length.

@@ -169,4 +170,8 @@ def get_ltor_masks_and_position_ids(data,
                 position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                 prev_index = i + 1
 
+    # Convert
+    if fp16:
+        attention_mask = attention_mask.half()
+
     return attention_mask, loss_mask, position_ids
pretrain_gpt2.py  ·  view file @ 8fae1cdd

@@ -45,6 +45,7 @@ def model_provider():
 def get_batch(data_iterator):
     """Generate a batch"""
     args = get_args()
+    tokenizer = get_tokenizer()
 
     # Items and their type.
     keys = ['text']

@@ -65,13 +66,11 @@ def get_batch(data_iterator):
 
     # Get the masks and postition ids.
     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
         tokens,
-        args.eod_token,
+        tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
-    # Convert
-    if args.fp16:
-        attention_mask = attention_mask.half()
+        args.eod_mask_loss,
+        args.fp16)
 
     return tokens, labels, loss_mask, attention_mask, position_ids

@@ -160,9 +159,6 @@ def get_train_val_test_data():
     args.do_valid = flags[1].item()
     args.do_test = flags[2].item()
 
-    tokenizer = get_tokenizer()
-    args.eod_token = tokenizer.eod_id
-
     return train_data, val_data, test_data
tasks/main.py  ·  view file @ 8fae1cdd

@@ -43,6 +43,10 @@ def get_tasks_args(parser):
                             'for training.')
     group.add_argument('--valid-data', nargs='*', default=None,
                        help='path(s) to the validation data.')
+    group.add_argument('--overlapping-eval', type=int, default=32,
+                       help='Sliding window for overlapping evaluation.')
+    group.add_argument('--strict-lambada', action='store_true',
+                       help='Use more difficult formulation of lambada.')
 
     return parser
tasks/zeroshot_gpt2/datasets.py  ·  new file (0 → 100644)  ·  view file @ 8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zero-shot datasets."""
import json
import math

import numpy as np
import torch

from megatron import get_args
from megatron import get_tokenizer
from megatron import print_rank_0
from .detokenizer import get_detokenizer


def build_dataset(task):
    """Helper function to select and build dataset."""

    if task == 'LAMBADA':
        return _build_lambada_dataset()
    if task == 'WIKITEXT103':
        return _build_wikitext103_dataset()

    raise NotImplementedError('dataset for {} task is not '
                              'implemented.'.format(task))
class _LMDataset(torch.utils.data.Dataset):

    def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
                 num_tokenized_tokens, overalapping_eval=None):
        self.tokens = tokens
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.overalapping_eval = overalapping_eval
        if self.overalapping_eval is None:
            self.overalapping_eval = self.seq_len
        self.overalapping_eval = max(1, self.overalapping_eval)
        self.num_original_tokens = num_original_tokens
        self.num_tokenized_tokens = num_tokenized_tokens
        self.total_targets = len(self.tokens) - 1
        # remove first sequence tokens
        targets = max(self.total_targets - self.overalapping_eval, 0)
        self.total_sequences = max(
            math.ceil(targets / self.overalapping_eval) + 1, 1)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        start_idx = idx * self.overalapping_eval
        end_idx = start_idx + self.seq_len
        tokens = self.tokens[start_idx:end_idx + 1]
        num_tokens = len(tokens)
        pad_mask = [1] * num_tokens
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])
        if self.overalapping_eval != self.seq_len and idx != 0:
            pad_mask[:-self.overalapping_eval] *= 0

        return {'text': np.array(tokens), 'pad_mask': pad_mask}
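
A quick usage sketch (not part of the new file), assuming _LMDataset as defined above; the toy token ids and window sizes are made up for illustration:

    # 11 token ids give 10 next-token targets; windows of seq_len 4 slide by 2.
    toy_tokens = list(range(11))
    ds = _LMDataset(toy_tokens, seq_len=4, pad_idx=0,
                    num_original_tokens=11, num_tokenized_tokens=11,
                    overalapping_eval=2)
    for i in range(len(ds)):
        sample = ds[i]
        print(sample['text'], sample['pad_mask'])
    # Every window after the first zeroes the leading seq_len - overalapping_eval
    # entries of its pad_mask, so in this toy setup each target token is scored
    # exactly once across the windows.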
class _LambadaDataset(torch.utils.data.Dataset):

    def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
        print_rank_0('> building lambada dataset from {} ...'.format(path))
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.tokenizer = tokenizer
        self.strict = strict

        self.tokens = []
        self.labels = []
        with open(path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = self.get_tokens(text)
                self.tokens.append(tokens)
                self.labels.append(labels)

    def get_tokens(self, text):
        if not self.strict:
            tokens = self.tokenizer.tokenize(text)
            return tokens[:-1], [tokens[-1]]
        last_token = text.split()[-1]
        start_idx = text.rfind(last_token)
        beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip())
        last_token = self.tokenizer.tokenize(' ' + last_token)
        return beginning_tokens, last_token

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        tokens = self.tokens[idx]
        num_tokens = len(tokens)
        pad_mask = [0] * num_tokens
        labels = self.labels[idx]
        pad_mask += [1] * len(labels)
        tokens = tokens + labels
        num_tokens = len(tokens)
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])

        return {'text': np.array(tokens), 'pad_mask': pad_mask}
def _build_lambada_dataset():
    """Build lambada dataset."""
    args = get_args()
    tokenizer = get_tokenizer()

    assert len(args.valid_data) == 1
    val_dataset = _LambadaDataset(args.valid_data, tokenizer.eod, tokenizer,
                                  args.seq_length, args.strict_lambada)
    print_rank_0(' > found {} samples.'.format(len(val_dataset)))

    return val_dataset


def _build_wikitext103_dataset():
    """Build wikitext 103 dataset."""
    args = get_args()
    tokenizer = get_tokenizer()

    assert len(args.valid_data) == 1
    with open(args.valid_data, "rb") as reader:
        entire_data = reader.read().decode('utf-8')
    num_original_tokens = len(entire_data.strip().split(" "))
    entire_data = get_detokenizer(args.valid_data)(entire_data)
    tokenized_data = tokenizer.tokenize(entire_data)
    num_tokenized_tokens = len(tokenized_data)

    val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
                             num_original_tokens, num_tokenized_tokens,
                             args.overlapping_eval)
    print_rank_0(' > number of original tokens: {}, number of detokenized '
                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))

    return val_dataset
detokenizer.py → tasks/zeroshot_gpt2/detokenizer.py  ·  moved (100755 → 100644)  ·  view file @ 8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Detokenization."""
import re


def ptb_detokenizer(string):
    string = string.replace(" '", "'")
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" n't", "n't")
...
@@ -44,17 +62,22 @@ def wikitext_detokenizer(string):
     return string
 
 
 def lambada_detokenizer(string):
     return string
 
 
-def get_detokenizer(path):
-    for key in DETOKENIZERS.keys():
-        if key in path:
-            print(key)
-            return DETOKENIZERS[key]
-
-
-DETOKENIZERS = {
+_DETOKENIZERS = {
     'ptb': ptb_detokenizer,
     'wikitext': wikitext_detokenizer,
     'lambada': lambada_detokenizer,
 }
+
+
+def get_detokenizer(path):
+    for key in _DETOKENIZERS.keys():
+        if key in path:
+            print(key)
+            return _DETOKENIZERS[key]
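
A small illustration (not in the file) of how the detokenizer is chosen: get_detokenizer matches the dictionary keys against the dataset path, so a hypothetical WikiText-103 file name selects wikitext_detokenizer.

    detok = get_detokenizer('/data/wikitext-103/wiki.test.tokens')  # hypothetical path
    assert detok is _DETOKENIZERS['wikitext']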
tasks/zeroshot_gpt2/evaluate.py  ·  new file (0 → 100644)  ·  view file @ 8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT2 zero-shot evaluation."""
import math

import torch

from megatron import get_args
from megatron import get_tokenizer
from megatron import mpu
from megatron import print_rank_0
from megatron.checkpointing import load_checkpoint
from megatron.model import GPT2Model
from megatron.training import get_model
from megatron.utils import get_ltor_masks_and_position_ids
from tasks.finetune_utils import build_data_loader

from .datasets import build_dataset
def get_model_provider(eval_metric):
    """Based on evaluation metric set the parallel-output flag and
    return the model provider."""

    def model_provider():
        """Build the model."""

        if eval_metric == 'loss':
            parallel_output = True
        elif eval_metric == 'accuracy':
            parallel_output = False
        else:
            raise NotImplementedError('output type for {} evaluation metric '
                                      'is not supported.'.format(eval_metric))

        print_rank_0('building GPT2 model ...')
        model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output)

        return model

    return model_provider
def process_batch(batch):
    """Process batch and produce inputs for the model."""

    args = get_args()
    tokenizer = get_tokenizer()

    loss_mask = batch['pad_mask'].long().cuda().contiguous().byte()
    tokens_ = batch['text'].long().cuda().contiguous()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()

    # Get the masks and position ids.
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
        args.eod_mask_loss,
        args.fp16)

    return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
    """Forward step."""

    # Get the batch.
    tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
        batch)

    # Forward model.
    output = model(tokens, position_ids, attention_mask)

    # For loss, return the unreduced loss.
    if eval_metric == 'loss':
        losses = mpu.vocab_parallel_cross_entropy(
            output.contiguous().float(), labels.contiguous())
        loss = torch.sum(
            losses.view(-1) * loss_mask.contiguous().view(-1).float())
        return loss

    # For accuracy, return the number of correctly predicted samples.
    if eval_metric == 'accuracy':
        outputs = torch.argmax(output, -1)
        correct = (outputs == labels).float()
        correct[(1 - loss_mask).bool()] = 1
        correct = correct.prod(-1)
        return correct.sum()

    raise NotImplementedError('forward method for evaluation metric {} '
                              'is not implemented.'.format(eval_metric))
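
A minimal sketch (not part of the file) of the accuracy reduction above: a sample counts as correct only when every unmasked target position is predicted correctly, which matters when the LAMBADA last word spans several BPE tokens.

    import torch

    predictions = torch.tensor([[7, 2, 9], [7, 2, 9]])
    labels      = torch.tensor([[7, 2, 9], [7, 5, 9]])
    loss_mask   = torch.tensor([[0, 1, 1], [0, 1, 1]])  # only last-word tokens count
    correct = (predictions == labels).float()
    correct[(1 - loss_mask).bool()] = 1   # positions outside the target always "count"
    print(correct.prod(-1))               # tensor([1., 0.]) -> 1 of 2 samples correct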
def evaluate(data_loader, model, eval_metric):
    """Evaluation."""
    args = get_args()

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_output = 0.0
    with torch.no_grad():
        # For all the batches in the dataset.
        for iteration, batch in enumerate(data_loader):
            if iteration % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(iteration))
            # Forward evaluation.
            output = forward_step(batch, model, eval_metric)

            # Reduce across processes.
            torch.distributed.all_reduce(output,
                                         group=mpu.get_data_parallel_group())

            total_output += output

    return total_output
def evaluate_and_print_results(task, data_loader, model, eval_metric):
    """Evaluate and print results on screen."""

    # Evaluate and get results.
    output = evaluate(data_loader, model, eval_metric)

    string = ' validation results on {} | '.format(task)
    if eval_metric == 'loss':
        num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
        num_original_tokens = data_loader.dataset.num_original_tokens
        val_loss = output / (num_tokenized_tokens - 1)
        ppl = math.exp(min(20, val_loss))
        token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
        string += 'avg loss: {:.4E} | '.format(val_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)

    elif eval_metric == 'accuracy':
        num_examples = len(data_loader.dataset)
        acc = output / num_examples
        string += 'number correct: {:.4E} | '.format(output)
        string += 'total examples: {:.4E} | '.format(num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)

    else:
        raise NotImplementedError('evaluation method for {} metric is not '
                                  'implemented yet.'.format(eval_metric))

    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)
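
The adjusted perplexity above rescales the average loss from BPE tokens back to the whitespace token count of the raw file, so numbers remain comparable with perplexities reported per original token. A small arithmetic sketch with made-up counts:

    import math

    total_loss = 350000.0          # sum of per-token losses (hypothetical)
    num_tokenized_tokens = 120000  # BPE tokens after detokenization (hypothetical)
    num_original_tokens = 100000   # whitespace tokens in the raw file (hypothetical)

    val_loss = total_loss / (num_tokenized_tokens - 1)           # ~2.92 nats per BPE token
    token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
    ppl = math.exp(min(20, val_loss))                            # ~18.5
    adjusted_ppl = math.exp(min(20, val_loss * token_ratio))     # ~33.1, per original token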
def main():
    """Main program."""
    args = get_args()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric))
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')