Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
716a3243
Commit
716a3243
authored
Apr 22, 2021
by
Jared Casper
Browse files
Merge branch 'main_generate' into 'main'
lm evaluation See merge request ADLR/megatron-lm!262
parents
7a5768ac
e5ec27d7
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
38 additions
and
1 deletion
+38
-1
megatron/text_generation_utils.py
megatron/text_generation_utils.py
+38
-1
No files found.
megatron/text_generation_utils.py
View file @
716a3243
...
@@ -190,6 +190,37 @@ def generate_samples_input_from_file(model):
...
@@ -190,6 +190,37 @@ def generate_samples_input_from_file(model):
raw_text
=
None
raw_text
=
None
context_count
+=
1
context_count
+=
1
# We added this function to support the tasks evaluation such as squad
# and drop in the https://github.com/EleutherAI/lm-evaluation-harness
# codebase. The lm-evaluation-harness code can now call this function
# similar to their current generate function call used for gpt style models.
def generate_samples_eval(model, context, max_gen_length, eos_token_id):
    # Generate samples for lm evaluation
    # NEED TO THINK ABOUT eos token
    #
    # Greedily decodes a continuation for a single prompt string and returns
    # only the newly generated text (the prompt characters are sliced off).
    #
    # Args:
    #     model: language model put into eval mode here.
    #         NOTE(review): presumably a Megatron GPT-style model accepted by
    #         get_token_stream — confirm against callers.
    #     context: prompt string to condition generation on.
    #     max_gen_length: maximum number of tokens to generate past the prompt.
    #     eos_token_id: token id that should terminate generation; stashed on
    #         the global args object so the sampling loop can read it.
    #
    # Returns:
    #     The detokenized generated text with the prompt prefix removed.

    args = get_args()
    tokenizer = get_tokenizer()

    # Prompt length in *characters* (not tokens) — used at the end to strip
    # the prompt from the detokenized output.
    raw_text_len = len(context)
    model.eval()

    context_tokens = tokenizer.tokenize(context)
    # Total sequence budget = prompt tokens + requested generation length.
    args.out_seq_length = max_gen_length + len(context_tokens)
    # Communicate the requested eos id to downstream sampling code via the
    # global args object (sample_sequence_batch checks for args.eos_id).
    args.eos_id = eos_token_id

    with torch.no_grad():
        token_stream = get_token_stream(model, [context_tokens])
        # Each iteration yields the tokens decoded so far; trim_decode_tokens
        # is overwritten every step, so only the final state is returned.
        for counter, decode_tokens in enumerate(token_stream):
            if counter == args.out_seq_length:
                break
            decode_tokens, _ = decode_tokens
            decode_tokens = decode_tokens[0].cpu().numpy().tolist()
            # Slice off the prompt characters, keeping only generated text.
            trim_decode_tokens = tokenizer.detokenize(
                decode_tokens)[raw_text_len:]

    # NOTE(review): if the stream yields no items, trim_decode_tokens is
    # unbound here — presumably the stream always yields at least once.
    return trim_decode_tokens
def
generate_samples_interactive
(
model
,
print_frequency
=
24
):
def
generate_samples_interactive
(
model
,
print_frequency
=
24
):
...
@@ -438,7 +469,13 @@ def sample_sequence_batch(model, context_tokens, context_lengths,
...
@@ -438,7 +469,13 @@ def sample_sequence_batch(model, context_tokens, context_lengths,
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
with
torch
.
no_grad
():
context_length
=
context_lengths
.
min
().
item
()
context_length
=
context_lengths
.
min
().
item
()
eos_id
=
tokenizer
.
eod
# added eos_id to support the function generate_samples_eval that passes
# eos_id as an argument and needs termination when that id is found.
if
hasattr
(
args
,
'eos_id'
):
eos_id
=
args
.
eos_id
else
:
eos_id
=
tokenizer
.
eod
counter
=
0
counter
=
0
org_context_length
=
context_length
org_context_length
=
context_length
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment