OpenDAS / Megatron-LM / Commits

Commit 24684cbb, authored Sep 29, 2021 by mshoeybi
Parent: ff2f0a05

    added BOS

In short: the commit threads a new add_BOS flag from the top-level generation API down to prompt tokenization, so a beginning-of-sequence token can optionally be prepended to every prompt; along the way it fixes an indexing bug in the broadcast of generation parameters.

Showing 2 changed files with 57 additions and 10 deletions (+57 / -10):

    megatron/inference/api.py            +48  -6
    megatron/inference/tokenization.py    +9  -4
megatron/inference/api.py

@@ -18,9 +18,48 @@
 import torch

 from megatron import mpu
 from .communication import broadcast_float_list
 from .generation import generate_tokens_probs_and_return_on_first_stage
-from .tokenization import tokenize_prompts
+from .tokenization import (tokenize_prompts,
+                           detokenize_generations)
+
+
+def generate_and_post_process(model,
+                              prompts=None,
+                              tokens_to_generate=0,
+                              return_output_log_probs=False,
+                              return_all_log_probs=False,
+                              temperature=1.0,
+                              add_BOS=False):
+    """TO DO ..."""
+
+    # Main inference.
+    tokens, lengths, output_log_probs, all_log_probs = generate(
+        model,
+        prompts=prompts,
+        tokens_to_generate=tokens_to_generate,
+        return_output_log_probs=return_output_log_probs,
+        return_all_log_probs=return_all_log_probs,
+        temperature=temperature,
+        add_BOS=add_BOS)
+
+    # Only post-process on first stage.
+    if mpu.is_pipeline_first_stage():
+        tokens, prompts_plus_generations, prompts_plus_generations_segments = \
+            detokenize_generations(tokens, lengths, True)
+
+        if return_output_log_probs:
+            output_log_probs = output_log_probs.cpu().numpy().tolist()
+        if return_all_log_probs:
+            all_log_probs = all_log_probs.cpu().numpy()  # .tolist()
+
+        return prompts_plus_generations, prompts_plus_generations_segments, \
+            output_log_probs, all_log_probs, tokens
+
+    return None
+
+
 def generate(model,
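The new generate_and_post_process wrapper runs generation on all ranks but detokenizes only on the first pipeline stage, where the outputs live; every other rank gets None back. A minimal caller sketch (hypothetical setup: model is an already-built Megatron GPT model with the distributed and pipeline groups initialized, and the prompt strings are placeholders):

    from megatron.inference.api import generate_and_post_process

    result = generate_and_post_process(
        model,
        prompts=["Deep learning is", "The capital of France is"],
        tokens_to_generate=32,
        return_output_log_probs=True,
        add_BOS=True)  # new in this commit: prepend a BOS token to each prompt

    # Only the first pipeline stage receives the post-processed outputs.
    if result is not None:
        prompts_plus_generations, segments, output_log_probs, \
            all_log_probs, tokens = result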
@@ -28,24 +67,27 @@ def generate(model,
              tokens_to_generate=0,
              return_output_log_probs=False,
              return_all_log_probs=False,
-             temperature=1.0):
+             temperature=1.0,
+             add_BOS=False):
     """TO DO ..."""

     # Make sure input params are available to all ranks.
     values = [tokens_to_generate,
               return_output_log_probs,
-              return_all_log_probs,
-              temperature]
-    values_float_tensor = broadcast_float_list(4, float_list=values)
+              return_all_log_probs,
+              temperature,
+              add_BOS]
+    values_float_tensor = broadcast_float_list(5, float_list=values)
     tokens_to_generate = int(values_float_tensor[0].item())
     return_output_log_probs = bool(values_float_tensor[1].item())
     return_all_log_probs = bool(values_float_tensor[2].item())
-    temperature = values_float_tensor[2].item()
+    temperature = values_float_tensor[3].item()
+    add_BOS = bool(values_float_tensor[4].item())

     # Tokenize prompts and get the batch.
     # Note that these tensors are broadcasted to all ranks.
     if torch.distributed.get_rank() == 0:
         assert prompts is not None
         assert tokens_to_generate > 0
         context_tokens_tensor, context_length_tensor = tokenize_prompts(
-            prompts=prompts, tokens_to_generate=tokens_to_generate)
+            prompts=prompts, tokens_to_generate=tokens_to_generate,
+            add_BOS=add_BOS)

     # Main inference function.
     # Note that the outputs are available on the first stage.
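Every scalar argument, booleans included, rides to the non-zero ranks inside a single float tensor via broadcast_float_list and is cast back on arrival. Note that the temperature line also fixes a latent bug: the old code read index 2 twice, so temperature silently took the value of return_all_log_probs. A standalone sketch of the pack/broadcast/unpack pattern, using plain torch.distributed rather than Megatron's helper (assumes the process group is initialized and rank 0 holds the real values; the literals are placeholders):

    import torch
    import torch.distributed as dist

    tokens_to_generate, return_output_log_probs = 32, True
    return_all_log_probs, temperature, add_BOS = False, 1.0, True

    # Pack mixed int/bool/float scalars into one float tensor ...
    values = [tokens_to_generate, return_output_log_probs,
              return_all_log_probs, temperature, add_BOS]
    t = torch.tensor(values, dtype=torch.float32, device="cuda")
    dist.broadcast(t, src=0)  # every rank now holds the same five floats

    # ... and cast each entry back to its intended type.
    tokens_to_generate = int(t[0].item())
    return_output_log_probs = bool(t[1].item())
    return_all_log_probs = bool(t[2].item())
    temperature = t[3].item()    # index 3, not 2: the bug fixed above
    add_BOS = bool(t[4].item())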
megatron/inference/tokenization.py

@@ -57,7 +57,8 @@ def detokenize_generations(tokens_gpu_tensor,
     return tokens, prompts_plus_generations


-def tokenize_prompts(prompts=None, tokens_to_generate=None, rank=0):
+def tokenize_prompts(prompts=None, tokens_to_generate=None, add_BOS=None,
+                     rank=0):
     """Tokenize prompts and make them available on all ranks."""

     # On all ranks set to None so we can pass them to functions
@@ -71,7 +72,7 @@ def tokenize_prompts(prompts=None, tokens_to_generate=None, rank=0):
         assert tokens_to_generate is not None
         # Tensor of tokens padded and their unpadded length.
         prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
-            _tokenize_prompts_and_batch(prompts, tokens_to_generate)
+            _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
         # We need the sizes of these tensors for the broadcast
         sizes_list = [prompts_tokens_cuda_long_tensor.size(0),  # Batch size
                       prompts_tokens_cuda_long_tensor.size(1)]  # Sequence length
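Only rank 0 tokenizes and builds the padded batch, so the other ranks must first learn the tensor's shape before they can allocate a receive buffer; that is why sizes_list is broadcast ahead of the token tensor itself. A standalone sketch of this two-step pattern (assumes an initialized torch.distributed process group; build_padded_batch is a hypothetical stand-in for the tokenize-and-pad step):

    import torch
    import torch.distributed as dist

    if dist.get_rank() == 0:
        tokens = build_padded_batch()  # hypothetical: (batch, seq) long tensor
        sizes = torch.tensor(list(tokens.size()), dtype=torch.long,
                             device="cuda")
    else:
        sizes = torch.empty(2, dtype=torch.long, device="cuda")
    dist.broadcast(sizes, src=0)  # step 1: every rank learns the shape

    if dist.get_rank() != 0:
        tokens = torch.empty(int(sizes[0]), int(sizes[1]),
                             dtype=torch.long, device="cuda")
    dist.broadcast(tokens, src=0)  # step 2: every rank gets the payload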
@@ -91,7 +92,7 @@ def tokenize_prompts(prompts=None, tokens_to_generate=None, rank=0):
     return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor


-def _tokenize_prompts_and_batch(prompts, tokens_to_generate):
+def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
     """Given a set of prompts and number of tokens to generate:
     - tokenize prompts
     - set the sequence length to be the max of length of prompts
@@ -102,7 +103,11 @@ def _tokenize_prompts_and_batch(prompts, tokens_to_generate):
     # Tokenize all the prompts.
     tokenizer = get_tokenizer()
-    prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
+    if add_BOS:
+        prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
+                          for prompt in prompts]
+    else:
+        prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]

     # Now we have a list of lists of tokens where each list has a different
     # size. We want to extend this list to:
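Note that the BOS branch prepends tokenizer.eod rather than a dedicated BOS id, presumably because the GPT-2 BPE vocabulary Megatron uses has only the single special token <|endoftext|>, which doubles as the sequence boundary. A toy, self-contained illustration of the branch (ToyTokenizer is hypothetical; real ids come from the BPE vocabulary):

    class ToyTokenizer:
        eod = 0  # stands in for the <|endoftext|> id

        def tokenize(self, text):
            return [len(w) for w in text.split()]  # stand-in for BPE ids

    tokenizer = ToyTokenizer()
    prompts = ["hello world", "a longer prompt here"]

    add_BOS = True
    if add_BOS:
        prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(p)
                          for p in prompts]
    else:
        prompts_tokens = [tokenizer.tokenize(p) for p in prompts]

    print(prompts_tokens)  # [[0, 5, 5], [0, 1, 6, 6, 4]]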