Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
8fae1cdd
Commit
8fae1cdd
authored
Apr 01, 2020
by
Mohammad
Browse files
refactored evaluate_gpt2
parent
bf3ce751
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
400 additions
and
619 deletions
+400
-619
evaluate_gpt2.py
evaluate_gpt2.py
+0
-575
megatron/arguments.py
megatron/arguments.py
+0
-28
megatron/utils.py
megatron/utils.py
+6
-1
pretrain_gpt2.py
pretrain_gpt2.py
+4
-8
tasks/main.py
tasks/main.py
+4
-0
tasks/zeroshot_gpt2/datasets.py
tasks/zeroshot_gpt2/datasets.py
+161
-0
tasks/zeroshot_gpt2/detokenizer.py
tasks/zeroshot_gpt2/detokenizer.py
+30
-7
tasks/zeroshot_gpt2/evaluate.py
tasks/zeroshot_gpt2/evaluate.py
+195
-0
No files found.
evaluate_gpt2.py
deleted
100755 → 0
View file @
bf3ce751
This diff is collapsed.
Click to expand it.
megatron/arguments.py
View file @
8fae1cdd
...
@@ -355,35 +355,7 @@ def _add_gpt2_args(parser):
...
@@ -355,35 +355,7 @@ def _add_gpt2_args(parser):
return
parser
return
parser
def add_evaluation_args(parser):
    """Evaluation arguments."""

    validation_group = parser.add_argument_group('validation',
                                                 'validation configurations')

    # Overrides for evaluation-time sizing; each falls back to its
    # training-time counterpart when left as None.
    validation_group.add_argument('--eval-batch-size', type=int, default=None,
                                  help='Data Loader batch size for evaluation datasets.'
                                  'Defaults to `--batch-size`')
    validation_group.add_argument('--eval-seq-length', type=int, default=None,
                                  help='Maximum sequence length to process for '
                                  'evaluation. Defaults to `--seq-length`')
    validation_group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
                                  help='Maximum number of predictions to use for '
                                  'evaluation. Defaults to '
                                  'math.ceil(`--eval-seq-length`*.15/10)*10')
    # Sliding-window stride for overlapping evaluation.
    validation_group.add_argument('--overlapping-eval', type=int, default=32,
                                  help='sliding window for overlapping eval ')
    # Task-formulation switches.
    validation_group.add_argument('--cloze-eval', action='store_true',
                                  help='Evaluation dataset from `--valid-data` is a cloze task')
    validation_group.add_argument('--strict-lambada', action='store_true',
                                  help='use more difficult formulation of lambada')
    # Alternative weight sources.
    validation_group.add_argument('--eval-hf', action='store_true',
                                  help='perform evaluation with huggingface openai model.'
                                  'use `--load` to specify weights path to be loaded')
    validation_group.add_argument('--load-openai', action='store_true',
                                  help='load openai weights into our model. Use `--load` '
                                  'to specify weights path to be loaded')
    return parser
def
add_text_generate_args
(
parser
):
def
add_text_generate_args
(
parser
):
"""Text generate arguments."""
"""Text generate arguments."""
...
...
megatron/utils.py
View file @
8fae1cdd
...
@@ -119,7 +119,8 @@ def get_ltor_masks_and_position_ids(data,
...
@@ -119,7 +119,8 @@ def get_ltor_masks_and_position_ids(data,
eod_token
,
eod_token
,
reset_position_ids
,
reset_position_ids
,
reset_attention_mask
,
reset_attention_mask
,
eod_mask_loss
):
eod_mask_loss
,
fp16
):
"""Build masks and position id for left to right model."""
"""Build masks and position id for left to right model."""
# Extract batch size and sequence length.
# Extract batch size and sequence length.
...
@@ -169,4 +170,8 @@ def get_ltor_masks_and_position_ids(data,
...
@@ -169,4 +170,8 @@ def get_ltor_masks_and_position_ids(data,
position_ids
[
b
,
(
i
+
1
):]
-=
(
i
+
1
-
prev_index
)
position_ids
[
b
,
(
i
+
1
):]
-=
(
i
+
1
-
prev_index
)
prev_index
=
i
+
1
prev_index
=
i
+
1
# Convert
if
fp16
:
attention_mask
=
attention_mask
.
half
()
return
attention_mask
,
loss_mask
,
position_ids
return
attention_mask
,
loss_mask
,
position_ids
pretrain_gpt2.py
View file @
8fae1cdd
...
@@ -45,6 +45,7 @@ def model_provider():
...
@@ -45,6 +45,7 @@ def model_provider():
def
get_batch
(
data_iterator
):
def
get_batch
(
data_iterator
):
"""Generate a batch"""
"""Generate a batch"""
args
=
get_args
()
args
=
get_args
()
tokenizer
=
get_tokenizer
()
# Items and their type.
# Items and their type.
keys
=
[
'text'
]
keys
=
[
'text'
]
...
@@ -65,13 +66,11 @@ def get_batch(data_iterator):
...
@@ -65,13 +66,11 @@ def get_batch(data_iterator):
# Get the masks and postition ids.
# Get the masks and postition ids.
attention_mask
,
loss_mask
,
position_ids
=
get_ltor_masks_and_position_ids
(
attention_mask
,
loss_mask
,
position_ids
=
get_ltor_masks_and_position_ids
(
tokens
,
tokens
,
args
.
eod_token
,
tokenizer
.
eod
,
args
.
reset_position_ids
,
args
.
reset_position_ids
,
args
.
reset_attention_mask
,
args
.
reset_attention_mask
,
args
.
eod_mask_loss
)
args
.
eod_mask_loss
,
# Convert
args
.
fp16
)
if
args
.
fp16
:
attention_mask
=
attention_mask
.
half
()
return
tokens
,
labels
,
loss_mask
,
attention_mask
,
position_ids
return
tokens
,
labels
,
loss_mask
,
attention_mask
,
position_ids
...
@@ -160,9 +159,6 @@ def get_train_val_test_data():
...
@@ -160,9 +159,6 @@ def get_train_val_test_data():
args
.
do_valid
=
flags
[
1
].
item
()
args
.
do_valid
=
flags
[
1
].
item
()
args
.
do_test
=
flags
[
2
].
item
()
args
.
do_test
=
flags
[
2
].
item
()
tokenizer
=
get_tokenizer
()
args
.
eod_token
=
tokenizer
.
eod_id
return
train_data
,
val_data
,
test_data
return
train_data
,
val_data
,
test_data
...
...
tasks/main.py
View file @
8fae1cdd
...
@@ -43,6 +43,10 @@ def get_tasks_args(parser):
...
@@ -43,6 +43,10 @@ def get_tasks_args(parser):
'for training.'
)
'for training.'
)
group
.
add_argument
(
'--valid-data'
,
nargs
=
'*'
,
default
=
None
,
group
.
add_argument
(
'--valid-data'
,
nargs
=
'*'
,
default
=
None
,
help
=
'path(s) to the validation data.'
)
help
=
'path(s) to the validation data.'
)
group
.
add_argument
(
'--overlapping-eval'
,
type
=
int
,
default
=
32
,
help
=
'Sliding window for overlapping evaluation.'
)
group
.
add_argument
(
'--strict-lambada'
,
action
=
'store_true'
,
help
=
'Use more difficult formulation of lambada.'
)
return
parser
return
parser
...
...
tasks/zeroshot_gpt2/datasets.py
0 → 100644
View file @
8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zero-shot datasets."""
import
json
import
math
import
numpy
as
np
import
torch
from
megatron
import
get_args
from
megatron
import
get_tokenizer
from
megatron
import
print_rank_0
from
.detokenizer
import
get_detokenizer
def build_dataset(task):
    """Dispatch to the dataset builder for `task`.

    Supported tasks are 'LAMBADA' and 'WIKITEXT103'; any other task name
    raises NotImplementedError.
    """
    if task == 'LAMBADA':
        return _build_lambada_dataset()
    elif task == 'WIKITEXT103':
        return _build_wikitext103_dataset()
    raise NotImplementedError('dataset for {} task is not '
                              'implemented.'.format(task))
class _LMDataset(torch.utils.data.Dataset):
    """Sliding-window language-modeling dataset over one long token stream.

    Each item is a (seq_len + 1)-token window; consecutive windows advance
    by `overalapping_eval` tokens. The pad mask zeroes out targets that a
    previous window already covered, so overlapped positions are scored
    exactly once across the dataset.
    """

    def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
                 num_tokenized_tokens, overalapping_eval=None):
        self.tokens = tokens
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        # Default stride is a full sequence (no overlap); clamp to >= 1.
        stride = overalapping_eval if overalapping_eval is not None else seq_len
        self.overalapping_eval = max(1, stride)
        self.num_original_tokens = num_original_tokens
        self.num_tokenized_tokens = num_tokenized_tokens
        # The first token of the stream is never a prediction target.
        self.total_targets = len(self.tokens) - 1
        # remove first sequence tokens
        remaining = max(self.total_targets - self.overalapping_eval, 0)
        self.total_sequences = max(
            math.ceil(remaining / self.overalapping_eval) + 1, 1)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        stride = self.overalapping_eval
        begin = idx * stride
        window = self.tokens[begin:begin + self.seq_len + 1]
        mask = [1] * len(window)
        # Right-pad short (final) windows up to seq_len + 1 tokens.
        shortfall = self.seq_len + 1 - len(window)
        if shortfall > 0:
            mask.extend([0] * shortfall)
            window = window + [self.pad_idx] * shortfall
        # Drop the mask entry for the first token (it is input only).
        mask = np.array(mask[1:])
        if stride != self.seq_len and idx != 0:
            # Zero out targets already scored by the previous window.
            mask[:-stride] *= 0
        return {'text': np.array(window), 'pad_mask': mask}
class _LambadaDataset(torch.utils.data.Dataset):
    """LAMBADA cloze dataset: predict the final word of each passage.

    `path` must be a JSON-lines file whose records carry a 'text' field.
    Each passage is split into context tokens and target tokens; the pad
    mask is 1 only on the target positions so only they are scored.
    """

    def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
        print_rank_0('> building lambada dataset from {} ...'.format(path))
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.tokenizer = tokenizer
        self.strict = strict

        self.tokens = []
        self.labels = []
        with open(path, 'r') as source:
            for line in source:
                passage = json.loads(line)['text']
                context, target = self.get_tokens(passage)
                self.tokens.append(context)
                self.labels.append(target)

    def get_tokens(self, text):
        """Split `text` into (context tokens, target tokens)."""
        if not self.strict:
            # Non-strict: target is just the last token of the tokenization.
            tokens = self.tokenizer.tokenize(text)
            return tokens[:-1], [tokens[-1]]
        # Strict: tokenize the context and the last whitespace-separated
        # word independently.
        last_word = text.split()[-1]
        cut = text.rfind(last_word)
        context_tokens = self.tokenizer.tokenize(text[:cut].strip())
        target_tokens = self.tokenizer.tokenize(' ' + last_word)
        return context_tokens, target_tokens

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        context = self.tokens[idx]
        target = self.labels[idx]
        sample = context + target
        # Score only the target positions.
        mask = [0] * len(context) + [1] * len(target)
        # Right-pad up to seq_len + 1 tokens.
        shortfall = self.seq_len + 1 - len(sample)
        if shortfall > 0:
            mask.extend([0] * shortfall)
            sample = sample + [self.pad_idx] * shortfall
        # First token is input only, so its mask entry is dropped.
        mask = np.array(mask[1:])
        return {'text': np.array(sample), 'pad_mask': mask}
def _build_lambada_dataset():
    """Build the LAMBADA validation dataset from `--valid-data`."""
    args = get_args()
    tokenizer = get_tokenizer()

    assert len(args.valid_data) == 1
    # BUG FIX: `args.valid_data` is a list of paths (declared nargs='*');
    # `_LambadaDataset` opens its `path` argument, so passing the list
    # raised a TypeError. Unpack the single entry instead.
    val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer,
                                  args.seq_length, args.strict_lambada)
    print_rank_0(' > found {} samples.'.format(len(val_dataset)))

    return val_dataset
def _build_wikitext103_dataset():
    """Build the WikiText-103 validation dataset from `--valid-data`."""
    args = get_args()
    tokenizer = get_tokenizer()

    assert len(args.valid_data) == 1
    # BUG FIX: `args.valid_data` is a list of paths (declared nargs='*').
    # `open()` needs the single path string, and `get_detokenizer()` does a
    # substring test against it; passing the list broke both.
    path = args.valid_data[0]
    with open(path, "rb") as reader:
        entire_data = reader.read().decode('utf-8')
    # Word count of the raw text, used later for the adjusted perplexity.
    num_original_tokens = len(entire_data.strip().split(" "))
    entire_data = get_detokenizer(path)(entire_data)
    tokenized_data = tokenizer.tokenize(entire_data)
    num_tokenized_tokens = len(tokenized_data)

    val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
                             num_original_tokens, num_tokenized_tokens,
                             args.overlapping_eval)
    print_rank_0(' > number of original tokens: {}, number of detokenized '
                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))

    return val_dataset
detokenizer.py
→
tasks/zeroshot_gpt2/
detokenizer.py
100755 → 100644
View file @
8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Detokenization."""
import
re
import
re
def
ptb_detokenizer
(
string
):
def
ptb_detokenizer
(
string
):
string
=
string
.
replace
(
" '"
,
"'"
)
string
=
string
.
replace
(
" '"
,
"'"
)
string
=
string
.
replace
(
"
\n
"
,
"
\n
"
)
string
=
string
.
replace
(
"
\n
"
,
"
\n
"
)
string
=
string
.
replace
(
"
\n
"
,
"
\n
"
)
string
=
string
.
replace
(
"
\n
"
,
"
\n
"
)
string
=
string
.
replace
(
" n't"
,
"n't"
)
string
=
string
.
replace
(
" n't"
,
"n't"
)
...
@@ -44,17 +62,22 @@ def wikitext_detokenizer(string):
...
@@ -44,17 +62,22 @@ def wikitext_detokenizer(string):
return
string
return
string
def lambada_detokenizer(string):
    """Identity detokenizer: LAMBADA passages are used verbatim."""
    return string
def
get_detokenizer
(
path
):
for
key
in
DETOKENIZERS
.
keys
():
if
key
in
path
:
print
(
key
)
return
DETOKENIZERS
[
key
]
DETOKENIZERS
=
{
_
DETOKENIZERS
=
{
'ptb'
:
ptb_detokenizer
,
'ptb'
:
ptb_detokenizer
,
'wikitext'
:
wikitext_detokenizer
,
'wikitext'
:
wikitext_detokenizer
,
'lambada'
:
lambada_detokenizer
,
'lambada'
:
lambada_detokenizer
,
}
}
def get_detokenizer(path):
    """Return the detokenizer whose registry key is a substring of `path`.

    Falls through (returns None) when no key matches; the caller then
    fails at the call site. NOTE(review): consider raising a clear error
    here instead — confirm no caller relies on the None fallthrough.
    """
    # BUG FIX: the registry was renamed `_DETOKENIZERS` in this file, but
    # the loop still iterated the old `DETOKENIZERS` name, raising a
    # NameError at runtime. (A stray debug `print(key)` was also removed.)
    for key in _DETOKENIZERS:
        if key in path:
            return _DETOKENIZERS[key]
tasks/zeroshot_gpt2/evaluate.py
0 → 100644
View file @
8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT2 zero-shot evaluation."""
import
math
import
torch
from
megatron
import
get_args
from
megatron
import
get_tokenizer
from
megatron
import
mpu
from
megatron
import
print_rank_0
from
megatron.checkpointing
import
load_checkpoint
from
megatron.model
import
GPT2Model
from
megatron.training
import
get_model
from
megatron.utils
import
get_ltor_masks_and_position_ids
from
tasks.finetune_utils
import
build_data_loader
from
.dataset
import
build_dataset
def get_model_provider(eval_metric):
    """Return a model-provider closure configured for `eval_metric`.

    'loss' keeps the model output parallel (the loss is computed with a
    vocab-parallel cross entropy); 'accuracy' gathers the full output so
    the argmax can run over the whole vocabulary.
    """

    def model_provider():
        """Build the GPT2 model with the right parallel-output setting."""
        output_flags = {'loss': True, 'accuracy': False}
        if eval_metric not in output_flags:
            raise NotImplementedError('output type for {} evaluation metric '
                                      'is not supported.'.format(eval_metric))

        print_rank_0('building GPT2 model ...')
        return GPT2Model(num_tokentypes=0,
                         parallel_output=output_flags[eval_metric])

    return model_provider
def process_batch(batch):
    """Process batch and produce inputs for the model.

    Returns (tokens, labels, attention_mask, position_ids, loss_mask),
    where `labels` are the inputs shifted left by one position and
    `loss_mask` marks the positions to score (from the dataset pad mask).
    """
    args = get_args()
    tokenizer = get_tokenizer()

    loss_mask = batch['pad_mask'].long().cuda().contiguous().byte()
    tokens_ = batch['text'].long().cuda().contiguous()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()

    # Get the masks and position ids.
    # BUG FIX: this commit added an `fp16` parameter to
    # get_ltor_masks_and_position_ids() (it halves the attention mask
    # internally), and pretrain_gpt2.py passes args.fp16 — this call site
    # was left with the old 5-argument form, which raises a TypeError.
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
        args.eod_mask_loss,
        args.fp16)

    return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
    """Run one evaluation forward pass and reduce it to a scalar.

    For 'loss' the result is the masked sum of per-token cross entropy;
    for 'accuracy' it is the number of samples whose masked positions
    were all predicted correctly.
    """
    # Unpack the batch.
    tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
        batch)

    # Forward pass.
    logits = model(tokens, position_ids, attention_mask)

    if eval_metric == 'loss':
        # Unreduced per-token loss, masked and summed.
        token_losses = mpu.vocab_parallel_cross_entropy(
            logits.contiguous().float(), labels.contiguous())
        flat_mask = loss_mask.contiguous().view(-1).float()
        return torch.sum(token_losses.view(-1) * flat_mask)

    if eval_metric == 'accuracy':
        predictions = torch.argmax(logits, -1)
        hits = (predictions == labels).float()
        # Unscored positions count as hits so prod() over the sequence
        # is 1 exactly when every scored position matched.
        hits[(1 - loss_mask).bool()] = 1
        return hits.prod(-1).sum()

    raise NotImplementedError('forward method for evaluation metric {} '
                              'is not implemented.'.format(eval_metric))
def evaluate(data_loader, model, eval_metric):
    """Accumulate the evaluation metric over the whole data loader."""
    args = get_args()

    # Evaluation mode disables dropout.
    model.eval()

    total = 0.0
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            if step % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(step))

            # Metric contribution of this batch.
            batch_output = forward_step(batch, model, eval_metric)

            # Sum across data-parallel ranks.
            torch.distributed.all_reduce(
                batch_output, group=mpu.get_data_parallel_group())

            total += batch_output

    return total
def evaluate_and_print_results(task, data_loader, model, eval_metric):
    """Evaluate and print results on screen."""

    # Run the full evaluation loop first.
    metric_total = evaluate(data_loader, model, eval_metric)

    report = ' validation results on {} | '.format(task)
    if eval_metric == 'loss':
        num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
        num_original_tokens = data_loader.dataset.num_original_tokens
        # The first token is never a target, hence the "- 1" terms.
        avg_loss = metric_total / (num_tokenized_tokens - 1)
        perplexity = math.exp(min(20, avg_loss))
        # Rescale by tokenized/original length so the perplexity is
        # comparable across different tokenizations.
        token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
        adjusted_perplexity = math.exp(min(20, avg_loss * token_ratio))
        report += ''.join([
            'avg loss: {:.4E} | '.format(avg_loss),
            'ppl: {:.4E} | '.format(perplexity),
            'adjusted ppl: {:.4E} | '.format(adjusted_perplexity),
            'token ratio: {} |'.format(token_ratio)])
    elif eval_metric == 'accuracy':
        num_examples = len(data_loader.dataset)
        report += ''.join([
            'number correct: {:.4E} | '.format(metric_total),
            'total examples: {:.4E} | '.format(num_examples),
            'avg accuracy: {:.4E}'.format(metric_total / num_examples)])
    else:
        raise NotImplementedError('evaluation method for {} metric is not '
                                  'implemented yet.'.format(eval_metric))

    # Frame the result line with dashes.
    banner = '-' * (len(report) + 1)
    print_rank_0(banner)
    print_rank_0(report)
    print_rank_0(banner)
def main():
    """Main program."""
    args = get_args()

    # Each task is scored with a fixed metric.
    metric_by_task = {'LAMBADA': 'accuracy', 'WIKITEXT103': 'loss'}
    if args.task not in metric_by_task:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))
    eval_metric = metric_by_task[args.task]

    # Set up model and optionally restore checkpointed weights.
    model = get_model(get_model_provider(eval_metric))
    if args.load is not None:
        load_checkpoint(model, None, None)

    # Data.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment