GitLab project: wxj / NeMo — Commits

Commit bc5c7fa7, authored Jan 07, 2025 by wxj
Commit message: First test commit
Parent: 70fddd0f
Changes: 290 files. Showing 20 changed files with 4348 additions and 0 deletions (+4348, -0).
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/api.py            +207  -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/beam_utils.py     +64   -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/communication.py  +185  -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/forward_step.py   +177  -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/generation.py     +432  -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/sampling.py       +93   -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/tokenization.py   +125  -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation_server.py         +241  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/__init__.py                     +0    -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/autoaugment.py                  +320  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/biencoder_dataset_utils.py      +209  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/data_samplers.py                +192  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/dataset_utils.py                +726  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/ict_dataset.py                  +156  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/image_folder.py                 +302  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/multimodal_dataset.py           +54   -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/orqa_wiki_dataset.py            +193  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/realm_dataset_utils.py          +199  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/realm_index.py                  +224  -0
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/vit_dataset.py                  +249  -0
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/api.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Inference API."""

import torch

from megatron.core import mpu
from .communication import broadcast_float_list
from .generation import (
    generate_tokens_probs_and_return_on_first_stage,
    score_and_return_on_first_stage,
    beam_search_and_return_on_first_stage)
from .tokenization import (
    tokenize_prompts,
    detokenize_generations)


def generate_and_post_process(model,
                              prompts=None,
                              tokens_to_generate=0,
                              return_output_log_probs=False,
                              top_k_sampling=0,
                              top_p_sampling=0.0,
                              top_p_decay=0.0,
                              top_p_bound=0.0,
                              temperature=1.0,
                              add_BOS=False,
                              use_eod_token_for_early_termination=True,
                              stop_on_double_eol=False,
                              stop_on_eol=False,
                              prevent_newline_after_colon=False,
                              random_seed=-1,
                              return_logits=False):
    """Run inference and post-process outputs, i.e., detokenize,
    move to cpu and convert to list."""

    # Main inference.
    tokens, lengths, output_log_probs, logits = generate(
        model,
        prompts=prompts,
        tokens_to_generate=tokens_to_generate,
        return_output_log_probs=return_output_log_probs,
        top_k_sampling=top_k_sampling,
        top_p_sampling=top_p_sampling,
        top_p_decay=top_p_decay,
        top_p_bound=top_p_bound,
        temperature=temperature,
        add_BOS=add_BOS,
        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
        stop_on_double_eol=stop_on_double_eol,
        stop_on_eol=stop_on_eol,
        prevent_newline_after_colon=prevent_newline_after_colon,
        random_seed=random_seed)

    # Only post-process on first stage.
    if mpu.is_pipeline_first_stage():
        tokens, prompts_plus_generations, prompts_plus_generations_segments = \
            detokenize_generations(tokens, lengths, True)

        if return_output_log_probs:
            output_log_probs = output_log_probs.cpu().numpy().tolist()
            for i, (prob, seg) in enumerate(zip(output_log_probs,
                                                prompts_plus_generations_segments)):
                output_log_probs[i] = prob[:len(seg) - 1]

        if return_logits:
            assert(tokens_to_generate == 0)
            assert(mpu.get_pipeline_model_parallel_world_size() == 1)
            return prompts_plus_generations, prompts_plus_generations_segments, \
                output_log_probs, tokens, logits
        else:
            return prompts_plus_generations, prompts_plus_generations_segments, \
                output_log_probs, tokens

    return None


def generate(model,
             prompts=None,
             tokens_to_generate=0,
             return_output_log_probs=False,
             top_k_sampling=0,
             top_p_sampling=0.0,
             top_p_decay=0.0,
             top_p_bound=0.0,
             temperature=1.0,
             add_BOS=False,
             use_eod_token_for_early_termination=True,
             stop_on_double_eol=False,
             stop_on_eol=False,
             prevent_newline_after_colon=False,
             random_seed=-1):
    """Given prompts and input parameters, run inference and return:
       tokens: prompts plus the generated tokens.
       lengths: length of the prompt + generations. Note that we can
           discard tokens in the tokens tensor that are after the
           corresponding length.
       output_log_probs: log probs of the tokens.
    """

    # Make sure input params are available to all ranks.
    values = [tokens_to_generate,
              return_output_log_probs,
              top_k_sampling, top_p_sampling, top_p_decay, top_p_bound,
              temperature, add_BOS, use_eod_token_for_early_termination,
              stop_on_double_eol,
              stop_on_eol,
              prevent_newline_after_colon,
              random_seed]
    values_float_tensor = broadcast_float_list(len(values), float_list=values)
    tokens_to_generate = int(values_float_tensor[0].item())
    return_output_log_probs = bool(values_float_tensor[1].item())
    top_k_sampling = int(values_float_tensor[2].item())
    top_p_sampling = values_float_tensor[3].item()
    top_p_decay = values_float_tensor[4].item()
    top_p_bound = values_float_tensor[5].item()
    temperature = values_float_tensor[6].item()
    add_BOS = bool(values_float_tensor[7].item())
    use_eod_token_for_early_termination = bool(values_float_tensor[8].item())
    stop_on_double_eol = bool(values_float_tensor[9].item())
    stop_on_eol = bool(values_float_tensor[10].item())
    prevent_newline_after_colon = bool(values_float_tensor[11].item())
    random_seed = int(values_float_tensor[12].item())

    if random_seed != -1:
        torch.random.manual_seed(random_seed)

    # Tokenize prompts and get the batch.
    # Note that these tensors are broadcasted to all ranks.
    if torch.distributed.get_rank() == 0:
        assert prompts is not None

    context_tokens_tensor, context_length_tensor = tokenize_prompts(
        prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)

    if tokens_to_generate == 0:
        return score_and_return_on_first_stage(
            model, context_tokens_tensor, context_length_tensor)

    # Main inference function.
    # Note that the outputs are available on the first stage.
    return generate_tokens_probs_and_return_on_first_stage(
        model, context_tokens_tensor, context_length_tensor,
        return_output_log_probs=return_output_log_probs,
        top_k=top_k_sampling,
        top_p=top_p_sampling,
        top_p_decay=top_p_decay,
        top_p_bound=top_p_bound,
        temperature=temperature,
        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
        stop_on_double_eol=stop_on_double_eol,
        stop_on_eol=stop_on_eol,
        prevent_newline_after_colon=prevent_newline_after_colon)


def beam_search_and_post_process(model,
                                 prompts=None,
                                 tokens_to_generate=0,
                                 beam_size=0,
                                 add_BOS=False,
                                 stop_token=50256,
                                 num_return_gen=1,
                                 length_penalty=1,
                                 prevent_newline_after_colon=False):
    """Run beam search and post-process outputs, i.e., detokenize,
    move to cpu and convert to list."""

    # Main inference.
    tokens, scores = beam_search(model,
                                 prompts=prompts,
                                 tokens_to_generate=tokens_to_generate,
                                 beam_size=beam_size,
                                 add_BOS=add_BOS,
                                 stop_token=stop_token,
                                 num_return_gen=num_return_gen,
                                 length_penalty=length_penalty,
                                 prevent_newline_after_colon=prevent_newline_after_colon)
    # Only post-process on first stage.
    if mpu.is_pipeline_first_stage():
        lengths = tokens.size(1) * torch.ones(beam_size, dtype=torch.int64,
                                              device=torch.cuda.current_device())
        tokens, prompts_plus_generations, prompts_plus_generations_segments = \
            detokenize_generations(tokens, lengths, True)
        scores = scores.cpu().numpy().tolist()
        return prompts_plus_generations, prompts_plus_generations_segments, scores

    return None


def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0,
                add_BOS=False, stop_token=50256, num_return_gen=1,
                length_penalty=1, prevent_newline_after_colon=False):
    # Make sure input params are available to all ranks.
    values = [tokens_to_generate,
              beam_size,
              add_BOS,
              stop_token,
              num_return_gen,
              length_penalty,
              prevent_newline_after_colon]
    values_float_tensor = broadcast_float_list(len(values), float_list=values)
    tokens_to_generate = int(values_float_tensor[0].item())
    beam_size = int(values_float_tensor[1].item())
    add_BOS = bool(values_float_tensor[2].item())
    stop_token = int(values_float_tensor[3].item())
    num_return_gen = int(values_float_tensor[4].item())
    length_penalty = values_float_tensor[5].item()
    prevent_newline_after_colon = values_float_tensor[6].item()

    context_tokens_tensor, context_length_tensor = tokenize_prompts(
        prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)

    return beam_search_and_return_on_first_stage(
        model, context_tokens_tensor, context_length_tensor,
        beam_size,
        stop_token=stop_token,
        num_return_gen=num_return_gen,
        length_penalty=length_penalty,
        prevent_newline_after_colon=prevent_newline_after_colon)
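A minimal driver sketch for the API above (not part of this commit): it assumes Megatron has already been initialized on every rank (model built, tokenizer and distributed state set up), and that only rank 0 supplies prompts while the other ranks pass None and simply participate in the broadcasts inside generate().

from megatron.inference.text_generation import generate_and_post_process

def run_example(model):
    # All ranks call this; only rank 0 needs real prompts.
    result = generate_and_post_process(
        model,
        prompts=["Deep learning is"],   # required only on rank 0
        tokens_to_generate=32,
        return_output_log_probs=True,
        top_p_sampling=0.9,
        temperature=0.8)
    if result is not None:              # only the first pipeline stage gets outputs
        texts, segments, log_probs, tokens = result
        return texts
    return None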
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/beam_utils.py (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## from huggingface beam search
class BeamHypotheses(object):
    def __init__(self, num_beams, length_penalty=1.0, early_stopping=False):
        """
        Initialize n-best list of hypotheses.
        """
        self.length_penalty = length_penalty
        self.early_stopping = early_stopping
        self.num_beams = num_beams
        self.beams = []
        self.worst_score = 1e9

    def __len__(self):
        """
        Number of hypotheses in the list.
        """
        return len(self.beams)

    def add(self, hyp, sum_logprobs, length):
        """
        Add a new hypothesis to the list.
        """
        score = sum_logprobs / length ** self.length_penalty
        if len(self) < self.num_beams or score > self.worst_score:
            self.beams.append((score, hyp))
            if len(self) > self.num_beams:
                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
                del self.beams[sorted_scores[0][1]]
                self.worst_score = sorted_scores[1][0]
            else:
                self.worst_score = min(score, self.worst_score)

    def is_done(self, best_sum_logprobs, cur_len):
        """
        If there are enough hypotheses and none of the hypotheses being generated
        can become better than the worst one in the heap, then we are done with this sentence.
        """
        if len(self) < self.num_beams:
            return False
        elif self.early_stopping:
            return True
        else:
            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
            ret = self.worst_score >= cur_score
            return ret
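A small, CPU-only sketch (illustrative scores only, not part of the commit) of how the container above is typically driven: hypotheses are added with their summed log-probabilities, and only the num_beams best survive after length-penalty normalization.

from megatron.inference.text_generation.beam_utils import BeamHypotheses

hyp = BeamHypotheses(num_beams=2, length_penalty=1.0)
hyp.add([1, 2, 3], sum_logprobs=-1.5, length=3)   # normalized score = -0.5
hyp.add([1, 2, 4], sum_logprobs=-3.0, length=3)   # normalized score = -1.0
hyp.add([1, 2, 5], sum_logprobs=-0.9, length=3)   # score = -0.3, evicts the worst beam
assert len(hyp) == 2
# True: no pending beam with best_sum_logprobs=-3.0 can beat the worst kept score.
print(hyp.is_done(best_sum_logprobs=-3.0, cur_len=3))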
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/communication.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Communications utilities."""

import torch

from megatron.core import mpu


# TODO: use functions from megatron/p2p
def recv_from_prev_pipeline_rank_(recv_buffer=None):
    """Receive from previous pipeline stage and update the
    input buffer inplace."""
    if not mpu.is_pipeline_first_stage():
        assert recv_buffer is not None
        recv_prev_op = torch.distributed.P2POp(
            torch.distributed.irecv, recv_buffer,
            mpu.get_pipeline_model_parallel_prev_rank())
        reqs = torch.distributed.batch_isend_irecv([recv_prev_op])
        for req in reqs:
            req.wait()
        # To protect against race condition when using batch_isend_irecv().
        torch.cuda.synchronize()


# TODO: use functions from megatron/p2p
def send_to_next_pipeline_rank(tensor=None):
    """Send output to the next pipeline stage."""
    if not mpu.is_pipeline_last_stage():
        assert tensor is not None
        send_next_op = torch.distributed.P2POp(
            torch.distributed.isend, tensor,
            mpu.get_pipeline_model_parallel_next_rank())
        reqs = torch.distributed.batch_isend_irecv([send_next_op])
        for req in reqs:
            req.wait()
        # To protect against race condition when using batch_isend_irecv().
        torch.cuda.synchronize()


def _is_cuda(tensor):
    """Check if a tensor is not none and is cuda."""
    assert tensor is not None
    assert tensor.is_cuda


def _is_cuda_contiguous(tensor):
    """Check if a tensor is not none, is cuda, and is contiguous."""
    _is_cuda(tensor)
    assert tensor.is_contiguous()


def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
    """Broadcast a tensor from last pipeline stage to all ranks."""

    is_last_stage = mpu.is_pipeline_last_stage()
    # If first stage and last stage are the same, then there is no
    # pipeline parallelism and no need to communicate.
    if mpu.is_pipeline_first_stage() and is_last_stage:
        return tensor

    if is_last_stage:
        _is_cuda_contiguous(tensor)
    else:
        tensor = torch.empty(size,
                             dtype=dtype,
                             device=torch.cuda.current_device())
    # Get the group and corresponding source rank.
    src = mpu.get_pipeline_model_parallel_last_rank()
    group = mpu.get_pipeline_model_parallel_group()
    torch.distributed.broadcast(tensor, src, group)

    return tensor


def broadcast_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
    """Broadcast tensor values from last stage into the first stage."""

    is_last_stage = mpu.is_pipeline_last_stage()
    is_first_stage = mpu.is_pipeline_first_stage()
    # If first stage and last stage are the same, then there is no
    # pipeline parallelism and no need to communicate.
    if is_first_stage and is_last_stage:
        return tensor
    # Only first and last stage pipeline stages need to be involved.
    if is_last_stage or is_first_stage:
        if is_last_stage:
            _is_cuda_contiguous(tensor)
        else:
            tensor = torch.empty(size,
                                 dtype=dtype,
                                 device=torch.cuda.current_device())
        src = mpu.get_pipeline_model_parallel_last_rank()
        group = mpu.get_embedding_group()
        # Broadcast from last stage into the first stage.
        torch.distributed.broadcast(tensor, src, group)
    else:
        tensor = None

    return tensor


def copy_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
    """Copy tensor values from last stage into the first stage.
    Note that the input tensor is updated in place."""

    is_last_stage = mpu.is_pipeline_last_stage()
    is_first_stage = mpu.is_pipeline_first_stage()
    # If first stage and last stage are the same, then there is no
    # pipeline parallelism and no need to communicate.
    if is_first_stage and is_last_stage:
        return
    # Only first and last stage pipeline stages need to be involved.
    if is_last_stage or is_first_stage:
        _is_cuda(tensor)
        is_contiguous = tensor.is_contiguous()
        src = mpu.get_pipeline_model_parallel_last_rank()
        group = mpu.get_embedding_group()
        if is_contiguous:
            tensor_ = tensor
        else:
            if is_last_stage:
                tensor_ = tensor.contiguous()
            else:
                tensor_ = torch.empty(size,
                                      dtype=dtype,
                                      device=torch.cuda.current_device())
        # Broadcast from last stage into the first stage.
        torch.distributed.broadcast(tensor_, src, group)
        # Update the first stage tensor.
        if is_first_stage and not is_contiguous:
            tensor[...] = tensor_


def broadcast_tensor(size, dtype, tensor=None, rank=0):
    """Given size and type of a tensor on all ranks and the tensor value
    only on a specific rank, broadcast from that rank to all other ranks.
    """
    if torch.distributed.get_rank() == rank:
        _is_cuda_contiguous(tensor)
    else:
        tensor = torch.empty(size,
                             dtype=dtype,
                             device=torch.cuda.current_device())

    torch.distributed.broadcast(tensor, rank)

    return tensor


def broadcast_list(size, dtype, list_values=None, rank=0):
    """Broadcast a list of values with a given type."""

    tensor = None
    if torch.distributed.get_rank() == rank:
        tensor = torch.tensor(list_values, dtype=dtype,
                              device=torch.cuda.current_device())

    return broadcast_tensor(size, dtype, tensor=tensor, rank=rank)


def broadcast_int_list(size, int_list=None, rank=0):
    """Broadcast a list of integer values."""

    return broadcast_list(size, torch.int64, list_values=int_list, rank=rank)


def broadcast_float_list(size, float_list=None, rank=0):
    """Broadcast a list of float values."""

    return broadcast_list(size, torch.float32, list_values=float_list,
                          rank=rank)
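The float broadcast above is how api.py ships its mixed int/bool/float arguments to every rank: everything is packed into a single float32 tensor and decoded with int()/bool() on the receiving side. A local, single-process sketch of that round trip (no distributed initialization, illustrative values only):

import torch

# Pack mixed-typed generation arguments into one float32 tensor, the way
# generate() does before calling broadcast_float_list().
values = [64, True, 0, 0.9, 0.0, 0.0, 1.0]            # ints, bools, floats
packed = torch.tensor(values, dtype=torch.float32)     # what gets broadcast

# Decode on the "receiving" side, mirroring generate().
tokens_to_generate = int(packed[0].item())             # 64
return_output_log_probs = bool(packed[1].item())       # True
top_p_sampling = packed[3].item()                      # 0.9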
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/forward_step.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Forward step utilities."""

from collections.abc import Iterable

import torch

from megatron.training import get_args
from megatron.core import mpu, InferenceParams
from .communication import (
    send_to_next_pipeline_rank,
    recv_from_prev_pipeline_rank_)


class ForwardStep:
    """Forward step function with all the communications.
    We use a class here to hide the inference parameters
    from the outside caller."""

    def __init__(self, model, max_batch_size, max_sequence_length):
        """Set values so we don't need to do it multiple times."""
        # Make sure model is in eval mode.
        assert not isinstance(model, Iterable), \
            'interleaving schedule is not supported for inference'
        model.eval()
        self.model = model
        # Initialize inference parameters.
        self.inference_params = InferenceParams(max_batch_size,
                                                max_sequence_length)
        # Pipelining arguments.
        args = get_args()
        self.pipeline_size_larger_than_one = (
            args.pipeline_model_parallel_size > 1)
        # Threshold of pipelining.
        self.pipelining_batch_x_seqlen = \
            args.inference_batch_times_seqlen_threshold

    def __call__(self, tokens, position_ids, attention_mask):
        """Invocation of the forward methods. Note that self.inference_params
        is being modified by the forward step."""
        # Pipelining case.
        if self.pipeline_size_larger_than_one:
            current_batch_x_seqlen = tokens.size(0) * tokens.size(1)
            if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen:
                micro_batch_size = \
                    max(1, self.pipelining_batch_x_seqlen // tokens.size(1))
                return _with_pipelining_forward_step(self.model,
                                                     tokens,
                                                     position_ids,
                                                     attention_mask,
                                                     self.inference_params,
                                                     micro_batch_size)

        return _no_pipelining_forward_step(self.model,
                                           tokens,
                                           position_ids,
                                           attention_mask,
                                           self.inference_params)


def _get_recv_buffer_dtype(args):
    """Receive happens between the layers."""
    if args.fp32_residual_connection:
        return torch.float
    return args.params_dtype


def _allocate_recv_buffer(batch_size, sequence_length):
    """Receive happens between the layers with size [s, b, h]."""
    if mpu.is_pipeline_first_stage():
        return None
    args = get_args()
    recv_size = (sequence_length, batch_size, args.hidden_size)
    return torch.empty(recv_size,
                       dtype=_get_recv_buffer_dtype(args),
                       device=torch.cuda.current_device())


def _forward_step_helper(model, tokens, position_ids, attention_mask,
                         inference_params, recv_buffer=None):
    """Single forward step. Update the allocate memory flag so
    only the first time the memory is allocated."""
    batch_size = tokens.size(0)
    sequence_length = tokens.size(1)
    if recv_buffer is None:
        recv_buffer = _allocate_recv_buffer(batch_size, sequence_length)

    # Receive from previous stage.
    recv_from_prev_pipeline_rank_(recv_buffer)

    # Forward pass through the model.
    model.set_input_tensor(recv_buffer)
    output_tensor = model(tokens, position_ids, attention_mask,
                          inference_params=inference_params)

    # Send output to the next stage.
    send_to_next_pipeline_rank(output_tensor)

    return output_tensor


def _no_pipelining_forward_step(model, tokens, position_ids, attention_mask,
                                inference_params, recv_buffer=None):
    """If recv_buffer is none, we will allocate one on the fly."""
    # Run a simple forward pass.
    output_tensor = _forward_step_helper(model, tokens, position_ids,
                                         attention_mask, inference_params,
                                         recv_buffer=recv_buffer)
    # Update the sequence length offset.
    inference_params.sequence_len_offset += tokens.size(1)

    logits = None
    if mpu.is_pipeline_last_stage():
        logits = output_tensor

    return logits


def _with_pipelining_forward_step(model, tokens, position_ids, attention_mask,
                                  inference_params, micro_batch_size):
    """No interleaving is supported."""
    sequence_length = tokens.size(1)
    batch_size = tokens.size(0)

    # Divide the batch dimension into micro batches.
    num_micro_batches, last_chunk = divmod(batch_size,
                                           micro_batch_size)
    if last_chunk > 0:
        num_micro_batches += 1

    # Preallocate memory for output logits.
    logits = None
    if mpu.is_pipeline_last_stage():
        args = get_args()
        logits = torch.empty(
            (batch_size, sequence_length, args.padded_vocab_size),
            dtype=torch.float32, device=torch.cuda.current_device())

    # Preallocate recv buffer.
    recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length)

    for micro_batch_index in range(num_micro_batches):
        # Slice among the batch dimension.
        start = micro_batch_index * micro_batch_size
        end = min(start + micro_batch_size, batch_size)
        this_micro_batch_size = end - start
        tokens2use = tokens[start:end, ...]
        position_ids2use = position_ids[start:end, ...]

        # Run a simple forward pass.
        if this_micro_batch_size != micro_batch_size:
            recv_buffer = None
        output = _forward_step_helper(model, tokens2use, position_ids2use,
                                      attention_mask, inference_params,
                                      recv_buffer=recv_buffer)

        # Adjust the batch size offset to account for the micro-batch.
        inference_params.batch_size_offset += this_micro_batch_size

        # Copy logits.
        if mpu.is_pipeline_last_stage():
            logits[start:end, ...] = output

    # Once we are done with all the micro-batches, we can
    # adjust the sequence length offset.
    inference_params.sequence_len_offset += sequence_length
    # and reset the batch size offset
    inference_params.batch_size_offset = 0

    return logits
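The micro-batching arithmetic in _with_pipelining_forward_step is just a divmod split of the batch dimension, with a smaller trailing chunk when the batch size is not divisible. A standalone sketch with hypothetical sizes:

# Hypothetical sizes: a batch of 7 split into micro-batches of 3 gives 3 chunks.
batch_size, micro_batch_size = 7, 3
num_micro_batches, last_chunk = divmod(batch_size, micro_batch_size)
if last_chunk > 0:
    num_micro_batches += 1   # a smaller trailing micro-batch

for i in range(num_micro_batches):
    start = i * micro_batch_size
    end = min(start + micro_batch_size, batch_size)
    print(start, end)        # (0, 3), (3, 6), (6, 7)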
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/generation.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Generation utilities."""

import torch
import torch.nn.functional as F

from megatron.training import get_args, get_tokenizer
from megatron.core import mpu
from megatron.training.utils import get_ltor_masks_and_position_ids
from .communication import (
    copy_from_last_to_first_pipeline_stage,
    broadcast_from_last_pipeline_stage,
    broadcast_from_last_to_first_pipeline_stage)
from .forward_step import ForwardStep
from .sampling import sample
from .beam_utils import BeamHypotheses


def score_and_return_on_first_stage(model, tokens, lengths):
    """Function for just scoring.

    Args:
        model: no interleaving is supported.
        tokens: prompt tokens extended to be of size [b, max_prompt_length]
        lengths: original prompt length, size: [b]
    Note: Outside of model, other parameters only need to be available on
        rank 0.
    Returns:
        output_log_probs: log probability of the selected tokens. size: [b, s]
    """

    args = get_args()

    batch_size = tokens.size(0)
    max_prompt_length = lengths.max().item()
    assert max_prompt_length == tokens.size(1)

    if max_prompt_length > args.max_position_embeddings:
        raise ValueError("Length of prompt + tokens_to_generate longer than allowed")

    if max_prompt_length * batch_size > args.max_tokens_to_oom:
        raise ValueError("Too many tokens. " + str(max_prompt_length * batch_size) +
                         " is greater than " + str(args.max_tokens_to_oom))

    # forward step.
    forward_step = ForwardStep(model, batch_size, max_prompt_length)

    # ===================
    # Pre-allocate memory
    # ===================

    # Log probability of the sequence (prompt + generated tokens).
    output_log_probs = None
    output_log_probs_size = (batch_size, max_prompt_length - 1)

    if mpu.is_pipeline_last_stage():
        output_log_probs = torch.empty(output_log_probs_size,
                                       dtype=torch.float32,
                                       device=torch.cuda.current_device())

    # =============
    # Run inference
    # =============
    with torch.no_grad():
        attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens)

        # logits will be meaningful only in the last pipeline stage.
        logits = forward_step(tokens, position_ids, attention_mask)

        if mpu.is_pipeline_last_stage():
            # Always the last stage should have an output.
            assert logits is not None
            log_probs = F.log_softmax(logits, dim=2)

            # Pick the tokens that we need to get the log
            # probabilities for. Note that next input token is
            # the token which we selected in the current logits,
            # so shift by 1.
            indices = torch.unsqueeze(tokens[:, 1:], 2)
            output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2)

    # ======================================
    # Broadcast to the first pipeline stage.
    # ======================================
    output_log_probs = broadcast_from_last_to_first_pipeline_stage(
        output_log_probs_size, torch.float32, output_log_probs)

    return tokens, lengths, output_log_probs, logits


def generate_tokens_probs_and_return_on_first_stage(
        model, tokens, lengths,
        return_output_log_probs=False,
        top_k=0, top_p=0.0, top_p_decay=0.0, top_p_bound=0.0,
        temperature=1.0,
        use_eod_token_for_early_termination=True,
        stop_on_double_eol=False,
        stop_on_eol=False,
        prevent_newline_after_colon=True):
    """Main token generation function.

    Args:
        model: no interleaving is supported.
        tokens: prompt tokens extended to be of size [b, max-sequence-length]
        lengths: original prompt length, size: [b]
        return_output_log_probs: flag to calculate the log probability of
            the generated tokens. Note that the log probability is the one
            from the original logit.
        top_k, top_p: top-k and top-p sampling parameters.
            Note that top-k = 1 is greedy. Also, these parameters are
            exclusive, meaning that:
                if top-k > 0 then we expect top-p = 0.
                if top-p > 0 then we check for top-k = 0.
        temperature: sampling temperature.
        use_eod_token_for_early_termination: if True, do early termination if
            all the sequences have reached this token.
        prevent_newline_after_colon: if True, it will disable generating a new
            line \n after :
    Note: Outside of model, other parameters only need to be available on
        rank 0.
    Returns: Note that the size is adjusted to a lower value than
        max-sequence-length if generation is terminated early.
        tokens: prompt and generated tokens. size: [b, :]
        generated_sequence_lengths: total length (including prompt) of
            the generated sequence. size: [b]
        output_log_probs: log probability of the selected tokens. size: [b, s]
    """

    args = get_args()
    tokenizer = get_tokenizer()

    batch_size = tokens.size(0)
    min_prompt_length = lengths.min().item()
    max_sequence_length = tokens.size(1)

    if max_sequence_length > args.max_position_embeddings:
        raise ValueError("Length of prompt + tokens_to_generate longer than allowed")

    if max_sequence_length * batch_size > args.max_tokens_to_oom:
        raise ValueError("Too many tokens. " + str(max_sequence_length * batch_size) +
                         " is greater than " + str(args.max_tokens_to_oom))

    # forward step.
    forward_step = ForwardStep(model, batch_size, max_sequence_length)

    # Added termination_id to support the case that we want to terminate the
    # generation once that id is generated.
    if hasattr(args, 'eos_id'):
        termination_id = args.eos_id
    else:
        termination_id = tokenizer.eod

    # ===================
    # Pre-allocate memory
    # ===================

    # Log probability of the sequence (prompt + generated tokens).
    output_log_probs = None
    output_log_probs_size = (batch_size, max_sequence_length - 1)
    # Lengths of generated sequence including prompts.
    generated_sequence_lengths = None
    if mpu.is_pipeline_last_stage():
        if return_output_log_probs:
            output_log_probs = torch.empty(output_log_probs_size,
                                           dtype=torch.float32,
                                           device=torch.cuda.current_device())
        generated_sequence_lengths = torch.ones(
            batch_size, dtype=torch.int64,
            device=torch.cuda.current_device()) * max_sequence_length

    # Whether we have reached a termination id.
    is_generation_done = torch.zeros(batch_size, dtype=torch.uint8,
                                     device=torch.cuda.current_device())

    # =============
    # Run inference
    # =============

    with torch.no_grad():
        attention_mask, position_ids = _build_attention_mask_and_position_ids(
            tokens)
        prev_context_length = 0
        for context_length in range(min_prompt_length, max_sequence_length):

            # Pick the slice that we need to pass through the network.
            tokens2use = tokens[:, prev_context_length:context_length]
            positions2use = position_ids[:, prev_context_length:context_length]
            attention_mask2use = attention_mask[
                ..., prev_context_length:context_length, :context_length]

            # logits will be meaningful only in the last pipeline stage.
            logits = forward_step(tokens2use, positions2use, attention_mask2use)

            if mpu.is_pipeline_last_stage():
                if prevent_newline_after_colon:
                    # disable "\n" after ":"
                    logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0],
                           -1, tokenizer.tokenize('\n')[0]] = -1e10
                # Always the last stage should have an output.
                assert logits is not None

                # Sample.
                last_token_logits = logits[:, -1, :]
                new_sample = sample(last_token_logits,
                                    top_k=top_k,
                                    top_p=top_p,
                                    temperature=temperature,
                                    vocab_size=tokenizer.vocab_size)
                if top_p > 0.0 and top_p_decay > 0.0:
                    top_p = top_p * top_p_decay
                    if top_p_bound > 0.0:
                        top_p = max(top_p, top_p_bound)

                # If a prompt length is smaller than or equal to the current
                # context length, it means we have started generating tokens.
                started = lengths <= context_length
                # Update the tokens.
                tokens[started, context_length] = new_sample[started]

                # Calculate the log probabilities.
                if return_output_log_probs:
                    log_probs = F.log_softmax(logits, dim=2)
                    if return_output_log_probs:
                        # Pick the tokens that we need to get the log
                        # probabilities for. Note that next input token is
                        # the token which we selected in the current logits,
                        # so shift by 1.
                        indices = torch.unsqueeze(
                            tokens[:, (prev_context_length + 1):(context_length + 1)],
                            2)
                        output_log_probs[:, prev_context_length:context_length] = \
                            torch.gather(log_probs, 2, indices).squeeze(2)

            # Update the tokens on the first stage so the next input to
            # the network is correct.
            copy_from_last_to_first_pipeline_stage(batch_size, torch.int64,
                                                   tokens[:, context_length])

            # Update the context length for the next token generation.
            prev_context_length = context_length

            # Check if all the sequences have hit the termination_id.
            done = None
            if mpu.is_pipeline_last_stage():
                # TODO(rprenger) These stopping methods are tokenizer dependent;
                # instead tokenization should be in the inference loop so stop sequences can be used
                if stop_on_double_eol:
                    hit_double_eol = (new_sample == 628).byte() & started.byte()
                    hit_two_eols = (new_sample == 198).byte() & \
                        (tokens[:, context_length - 1] == 198).byte() & started.byte()
                    done_token = hit_double_eol | hit_two_eols
                elif stop_on_eol:
                    hit_double_eol = (new_sample == 628).byte() & started.byte()
                    hit_eol = (new_sample == 198).byte() & started.byte()
                    done_token = hit_double_eol | hit_eol
                else:
                    done_token = (new_sample == termination_id).byte() & \
                        started.byte()

                just_finished = (done_token & ~is_generation_done).bool()
                generated_sequence_lengths[just_finished.view(-1)] = \
                    context_length + 1
                is_generation_done = is_generation_done | done_token
                done = torch.all(is_generation_done)
            done = broadcast_from_last_pipeline_stage(1, torch.uint8,
                                                      tensor=done)
            if use_eod_token_for_early_termination and done:
                break

    # ===================================================
    # Update the length based on the max generated length.
    # ===================================================

    tokens = tokens[:, :(context_length + 1)]
    if mpu.is_pipeline_last_stage():
        if return_output_log_probs:
            output_log_probs = output_log_probs[:, :context_length]

    # ======================================
    # Broadcast to the first pipeline stage.
    # ======================================

    generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage(
        batch_size, torch.int64, generated_sequence_lengths)
    if return_output_log_probs:
        output_log_probs_size = (batch_size, context_length)
        output_log_probs = broadcast_from_last_to_first_pipeline_stage(
            output_log_probs_size, torch.float32, output_log_probs)

    return tokens, generated_sequence_lengths, output_log_probs, None


def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size,
                                          stop_token, num_return_gen,
                                          length_penalty,
                                          prevent_newline_after_colon=True):
    args = get_args()
    tokenizer = get_tokenizer()

    batch_size = tokens.size(0)
    assert(batch_size == 1)
    prompt_length = lengths.item()
    final_sequence_length = tokens.size(1)
    final_sequence_length = min(final_sequence_length, args.max_position_embeddings)

    # If the context is too big, this happens
    if prompt_length >= final_sequence_length:
        raise ValueError("context length + tokens_to_generate too large")

    # forward step.
    forward_step = ForwardStep(model, beam_size, final_sequence_length)

    beam_hyp = BeamHypotheses(beam_size, length_penalty)
    best_batches = None
    done = torch.zeros(1, dtype=torch.uint8, device=torch.cuda.current_device())
    scores = torch.zeros(beam_size,
                         dtype=torch.float32,
                         device=torch.cuda.current_device()).unsqueeze(1)
    scores_size_tensor, tokens_size_tensor = None, None

    # =============
    # Run inference
    # =============
    with torch.no_grad():
        tokens = tokens.repeat(beam_size, 1)
        attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens)
        prev_context_length = 0
        for context_length in range(prompt_length, final_sequence_length):

            # Pick the slice that we need to pass through the network.
            tokens2use = tokens[:, prev_context_length:context_length]
            positions2use = position_ids[:, prev_context_length:context_length]
            attention_mask2use = attention_mask[
                ..., prev_context_length:context_length, :context_length]

            # logits will be meaningful only in the last pipeline stage.
            logits = forward_step(tokens2use, positions2use, attention_mask2use)

            if mpu.is_pipeline_last_stage():
                if prevent_newline_after_colon:
                    # disable "\n" after ":"
                    logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0],
                           -1, tokenizer.tokenize('\n')[0]] = -1e10
                vocab_size = logits.size(2)
                log_probs = F.log_softmax(logits, dim=2)
                new_scores = log_probs[:, -1, :] + scores

                if context_length == prompt_length:  # if this is the first one
                    sorted_scores, indices = torch.sort(new_scores[0, :], descending=True)
                else:
                    sorted_scores, indices = torch.sort(new_scores.view(-1), descending=True)

                best_beam_ids = torch.div(indices[:2 * beam_size], vocab_size).trunc().long()
                best_words = indices[:2 * beam_size] % vocab_size
                best_scores = sorted_scores[:2 * beam_size]

                next_beams = []
                for beam_token_rank, (token_id, beam_score, beam_id) in enumerate(
                        zip(best_words, best_scores, best_beam_ids)):
                    if token_id.item() == stop_token:
                        # if beam_token does not belong to top num_beams tokens, it should not be added
                        is_beam_token_worse_than_top_num_beams = beam_token_rank >= beam_size
                        if is_beam_token_worse_than_top_num_beams:
                            continue
                        beam_hyp.add(tokens[beam_id].clone(),
                                     beam_score,
                                     context_length + 1 - prompt_length)
                    else:
                        # add next predicted token since it is not eos_token
                        next_beams.append((token_id, beam_score, beam_id))

                    if len(next_beams) == beam_size:
                        break

                if beam_hyp.is_done(best_scores.max().item(),
                                    context_length + 1 - prompt_length):
                    done = torch.ones(1, dtype=torch.uint8,
                                      device=torch.cuda.current_device())

                best_batches = tokens.new([item[2] for item in next_beams])
                tokens = tokens[best_batches, :]
                tokens[:, context_length] = tokens.new([item[0] for item in next_beams])
                scores = scores.new([item[1] for item in next_beams]).unsqueeze(1)

            # torch.distributed.barrier()
            done = broadcast_from_last_pipeline_stage(1, torch.uint8, done)
            if done:
                break

            # Update the tokens on the first stage so the next input to
            # the network is correct.
            copy_from_last_to_first_pipeline_stage(tokens.size(), torch.int64,
                                                   tokens)

            # set inference key values to make it consistent with best beam index
            best_batches = broadcast_from_last_pipeline_stage(beam_size, torch.int64,
                                                              best_batches)
            forward_step.inference_params.swap_key_value_dict(best_batches)

            # Update the context length for the next token generation.
            prev_context_length = context_length

        if mpu.is_pipeline_last_stage():
            # if cannot find stop token, add open beams to hyps
            if not done:
                for beam_id in range(beam_size):
                    beam_hyp.add(tokens[beam_id].clone(),
                                 scores[beam_id].squeeze(),
                                 context_length + 1 - prompt_length)

            # rank based on scores
            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True)
            num_return_gen = min(num_return_gen, len(sorted_hyps))
            scores = [sorted_hyps[i][0] for i in range(num_return_gen)]
            tokens = [sorted_hyps[i][1] for i in range(num_return_gen)]
            scores = torch.stack(scores, dim=0)
            tokens = torch.stack(tokens, dim=0)
            scores_size_tensor = torch.tensor(scores.shape, dtype=torch.int64,
                                              device=torch.cuda.current_device())
            tokens_size_tensor = torch.tensor(tokens.shape, dtype=torch.int64,
                                              device=torch.cuda.current_device())

        scores_size_tensor = broadcast_from_last_pipeline_stage(
            1, torch.int64, scores_size_tensor)
        tokens_size_tensor = broadcast_from_last_pipeline_stage(
            2, torch.int64, tokens_size_tensor)

        scores = broadcast_from_last_to_first_pipeline_stage(
            tuple(scores_size_tensor), torch.float32, scores)
        tokens = broadcast_from_last_to_first_pipeline_stage(
            tuple(tokens_size_tensor), torch.int64, tokens)

    return tokens, scores


def _build_attention_mask_and_position_ids(tokens):
    """Build the attention mask and position ids for the input tokens."""

    # Since we are not interested in loss-mask and reset attention/position
    # is also False, eod_token is not used so it is safe to set it to None.
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        data=tokens,
        eod_token=None,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False)

    return attention_mask, position_ids
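The shift-by-one gather used to fill output_log_probs in both functions above can be checked in isolation on CPU; a small sketch with random logits (toy shapes only, no Megatron dependencies):

import torch
import torch.nn.functional as F

b, s, v = 2, 5, 11                           # toy batch, sequence, vocab sizes
tokens = torch.randint(0, v, (b, s))
logits = torch.randn(b, s, v)

log_probs = F.log_softmax(logits, dim=2)
# Log-prob of token t+1 under the logits produced at position t:
indices = torch.unsqueeze(tokens[:, 1:], 2)                       # [b, s-1, 1]
output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2) # [b, s-1]
print(output_log_probs.shape)                                      # torch.Size([2, 4])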
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/sampling.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Sampling utilities.
Part of this code is inspired by:
 - https://github.com/ari-holtzman/degen/blob/master/gen.py
 - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html
"""

import torch


def modify_logits_for_top_k_filtering(logits, top_k):
    """Set the logits for non top-k values to -inf."""

    filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
    logits.masked_fill_(filter_, float('-Inf'))


def modify_logits_for_top_p_filtering(logits, top_p):
    """Set the logits for non top-p values to -inf."""

    # First sort and calculate cumulative sum of probabilities.
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)

    # Filtering based on the cumulative sum.
    filter_ = cumulative_probs > top_p
    # This shift by 1 is weird and I cannot justify it. This existed
    # in the original implementation:
    #   https://github.com/ari-holtzman/degen/blob/master/gen.py
    # and I guess it is needed so keeping it for now.
    filter_[:, 1:] = filter_[:, :-1].clone()
    # Make sure we at least have one token to select from.
    filter_[..., 0] = 0

    # Fill in the filtered part
    filter_ = filter_.scatter(1, sorted_indices, filter_)
    logits.masked_fill_(filter_, float('-Inf'))


def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None):
    """Sample and generate a token.
    Note: logits has the dimension [b, v] where b is the batch size
          and v is the vocabulary size.
    If vocab_size is provided, we will make sure the sample that is
    generated is in [0, vocab-size). This will avoid out-of-vocabulary
    generations due to padding.
    """

    # Check logits for consistency.
    assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.'
    assert logits.type() == 'torch.cuda.FloatTensor', \
        'input logits should be floats.'

    # Greedy is just simple argmax.
    if top_k == 1:
        assert top_p == 0.0, 'cannot set both greedy and top-p samplings.'
        samples = torch.argmax(logits, dim=-1)

    # Top-k or top-p sampling.
    else:
        # Clone so we do not modify the inputs,
        logits = logits.clone()
        # Apply temperature in place.
        if temperature != 1.0:
            logits.div_(temperature)

        if top_k > 1:
            assert top_p == 0.0, 'cannot set both top-k and top-p samplings.'
            assert top_k <= logits.size(1), 'top-k is larger than logit size.'
            if vocab_size:
                assert top_k < vocab_size, 'top-k is larger than vocab size.'
            modify_logits_for_top_k_filtering(logits, top_k)

        elif top_p > 0.0:
            assert top_p <= 1.0, 'top-p should be in (0, 1].'
            modify_logits_for_top_p_filtering(logits, top_p)

        # After filtering, we need to recalculate the distribution.
        probs = logits.softmax(dim=-1)
        samples = torch.multinomial(probs, num_samples=1).view(-1)

    # If vocab size is provided, make sure the samples are in
    # the range [0, vocab-size).
    if vocab_size:
        samples = torch.clamp(samples, min=0, max=(vocab_size - 1))

    return samples
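The top-p filter can be exercised on CPU without the CUDA assertion in sample(); a small sketch (illustrative logits, not part of the commit) showing that tokens outside the nucleus are pushed to -inf before resampling:

import torch
from megatron.inference.text_generation.sampling import modify_logits_for_top_p_filtering

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])     # [b=1, v=5]
modify_logits_for_top_p_filtering(logits, top_p=0.8)      # masks in place
print(logits)                        # low-probability tail is now -inf
probs = logits.softmax(dim=-1)       # renormalized nucleus distribution
print(torch.multinomial(probs, num_samples=1))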
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation/tokenization.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Tokenization utilities."""

import torch

from megatron.training import get_tokenizer, get_args
from .communication import broadcast_int_list, broadcast_tensor


def detokenize_generations(tokens_gpu_tensor,
                           lengths_gpu_tensor,
                           return_segments):
    """Detokenize the generated tokens."""

    tokenizer = get_tokenizer()
    args = get_args()
    prompts_plus_generations = []
    if return_segments:
        prompts_plus_generations_segments = []

    tokens = tokens_gpu_tensor.cpu().numpy().tolist()
    lengths = lengths_gpu_tensor.cpu().numpy().tolist()
    for sequence_tokens, length in zip(tokens, lengths):
        sequence_tokens = sequence_tokens[:length]
        prompts_plus_generations.append(
            tokenizer.detokenize(sequence_tokens))
        if return_segments:
            words = []
            for token in sequence_tokens:
                if args.tokenizer_type in ['SentencePieceTokenizer',
                                           'GPTSentencePieceTokenizer',
                                           'Llama2Tokenizer']:
                    word = tokenizer.decoder[token]
                elif args.tokenizer_type == 'NullTokenizer':
                    word = str(token)
                else:
                    word = tokenizer.tokenizer.decoder[token]
                    word = bytearray(
                        [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
                            'utf-8', errors='replace')
                words.append(word)
            prompts_plus_generations_segments.append(words)

    if return_segments:
        return tokens, prompts_plus_generations, \
            prompts_plus_generations_segments

    return tokens, prompts_plus_generations


def tokenize_prompts(prompts=None, tokens_to_generate=None,
                     add_BOS=None, rank=0):
    """Tokenize prompts and make them available on all ranks."""

    # On all ranks set to None so we can pass them to functions
    sizes_list = None
    prompts_tokens_cuda_long_tensor = None
    prompts_length_cuda_long_tensor = None

    # On the specified rank, build the above.
    if torch.distributed.get_rank() == rank:
        assert prompts is not None
        assert tokens_to_generate is not None
        # Tensor of tokens padded and their unpadded length.
        prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
            _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
        # We need the sizes of these tensors for the broadcast
        sizes_list = [prompts_tokens_cuda_long_tensor.size(0),  # Batch size
                      prompts_tokens_cuda_long_tensor.size(1)]  # Sequence length

    # First, broadcast the sizes.
    sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)

    # Now that we have the sizes, we can broadcast the tokens
    # and length tensors.
    sizes = sizes_tensor.tolist()
    prompts_tokens_cuda_long_tensor = broadcast_tensor(
        sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
    prompts_length_cuda_long_tensor = broadcast_tensor(
        sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
        rank=rank)

    return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor


def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
    """Given a set of prompts and number of tokens to generate:
        - tokenize prompts
        - set the sequence length to be the max of length of prompts
          plus the number of tokens we would like to generate
        - pad all the sequences to this length so we can convert them
          into a 2D tensor.
    """

    # Tokenize all the prompts.
    tokenizer = get_tokenizer()
    if add_BOS:
        prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
                          for prompt in prompts]
    else:
        prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]

    # Now we have a list of lists of tokens where each list has a different
    # size. We want to extend this list to:
    #   - incorporate the tokens that need to be generated
    #   - make all the sequences equal length.
    # Get the prompts length.
    prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
    # Get the max prompts length.
    max_prompt_len = max(prompts_length)
    # Number of tokens in each sample of the batch.
    samples_length = max_prompt_len + tokens_to_generate
    # Now update the list of lists to be of the same size: samples_length.
    for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
        padding_size = samples_length - prompt_length
        prompt_tokens.extend([tokenizer.eod] * padding_size)

    # Now we are in a structured format, we can convert to tensors.
    prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.long, device='cuda')
    prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.long, device='cuda')

    return prompts_tokens_tensor, prompts_length_tensor
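The padding scheme in _tokenize_prompts_and_batch can be reproduced without a real tokenizer; a CPU-only sketch with hypothetical token ids and a made-up pad/EOD id of 0:

import torch

# Toy prompts already "tokenized" into ids; pad with a hypothetical EOD id 0.
prompts_tokens = [[5, 6, 7], [8, 9]]
tokens_to_generate = 4
eod = 0

prompts_length = [len(p) for p in prompts_tokens]            # [3, 2]
samples_length = max(prompts_length) + tokens_to_generate    # 3 + 4 = 7
for p, n in zip(prompts_tokens, prompts_length):
    p.extend([eod] * (samples_length - n))

tokens = torch.tensor(prompts_tokens, dtype=torch.long)      # shape [2, 7]
lengths = torch.tensor(prompts_length, dtype=torch.long)     # tensor([3, 2])
print(tokens.shape, lengths)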
Megatron-LM-core_r0.7.0.beta/megatron/inference/text_generation_server.py (new file, mode 100644)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
datetime
import
torch
import
json
import
threading
from
flask
import
Flask
,
request
,
jsonify
,
current_app
from
flask_restful
import
Resource
,
Api
from
megatron.training
import
get_args
from
megatron.inference.text_generation
import
generate_and_post_process
from
megatron.inference.text_generation
import
beam_search_and_post_process
GENERATE_NUM
=
0
BEAM_NUM
=
1
lock
=
threading
.
Lock
()
class
MegatronGenerate
(
Resource
):
def
__init__
(
self
,
model
):
self
.
model
=
model
@
staticmethod
def
send_do_generate
():
choice
=
torch
.
tensor
([
GENERATE_NUM
],
dtype
=
torch
.
long
,
device
=
'cuda'
)
torch
.
distributed
.
broadcast
(
choice
,
0
)
@
staticmethod
def
send_do_beam_search
():
choice
=
torch
.
tensor
([
BEAM_NUM
],
dtype
=
torch
.
long
,
device
=
'cuda'
)
torch
.
distributed
.
broadcast
(
choice
,
0
)
def
put
(
self
):
args
=
get_args
()
if
not
"prompts"
in
request
.
get_json
():
return
"prompts argument required"
,
400
if
"max_len"
in
request
.
get_json
():
return
"max_len is no longer used. Replace with tokens_to_generate"
,
400
if
"sentences"
in
request
.
get_json
():
return
"sentences is no longer used. Replace with prompts"
,
400
prompts
=
request
.
get_json
()[
"prompts"
]
if
not
isinstance
(
prompts
,
list
):
return
"prompts is not a list of strings"
,
400
if
len
(
prompts
)
==
0
:
return
"prompts is empty"
,
400
if
len
(
prompts
)
>
128
:
return
"Maximum number of prompts is 128"
,
400
tokens_to_generate
=
64
# Choosing hopefully sane default. Full sequence is slow
if
"tokens_to_generate"
in
request
.
get_json
():
tokens_to_generate
=
request
.
get_json
()[
"tokens_to_generate"
]
if
not
isinstance
(
tokens_to_generate
,
int
):
return
"tokens_to_generate must be an integer greater than 0"
if
tokens_to_generate
<
0
:
return
"tokens_to_generate must be an integer greater than or equal to 0"
logprobs
=
False
if
"logprobs"
in
request
.
get_json
():
logprobs
=
request
.
get_json
()[
"logprobs"
]
if
not
isinstance
(
logprobs
,
bool
):
return
"logprobs must be a boolean value"
if
tokens_to_generate
==
0
and
not
logprobs
:
return
"tokens_to_generate=0 implies logprobs should be True"
temperature
=
1.0
if
"temperature"
in
request
.
get_json
():
temperature
=
request
.
get_json
()[
"temperature"
]
if
not
(
type
(
temperature
)
==
int
or
type
(
temperature
)
==
float
):
return
"temperature must be a positive number less than or equal to 100.0"
if
not
(
0.0
<
temperature
<=
100.0
):
return
"temperature must be a positive number less than or equal to 100.0"
top_k
=
0.0
if
"top_k"
in
request
.
get_json
():
top_k
=
request
.
get_json
()[
"top_k"
]
if
not
(
type
(
top_k
)
==
int
):
return
"top_k must be an integer equal to or greater than 0 and less than or equal to 1000"
if
not
(
0
<=
top_k
<=
1000
):
return
"top_k must be equal to or greater than 0 and less than or equal to 1000"
top_p
=
0.0
if
"top_p"
in
request
.
get_json
():
top_p
=
request
.
get_json
()[
"top_p"
]
if
not
(
type
(
top_p
)
==
float
):
return
"top_p must be a positive float less than or equal to 1.0"
if
top_p
>
0.0
and
top_k
>
0.0
:
return
"cannot set both top-k and top-p samplings."
if
not
(
0
<=
top_p
<=
1.0
):
return
"top_p must be less than or equal to 1.0"
top_p_decay
=
0.0
if
"top_p_decay"
in
request
.
get_json
():
top_p_decay
=
request
.
get_json
()[
"top_p_decay"
]
if
not
(
type
(
top_p_decay
)
==
float
):
return
"top_p_decay must be a positive float less than or equal to 1.0"
if
top_p
==
0.0
:
return
"top_p_decay cannot be set without top_p"
if
not
(
0
<=
top_p_decay
<=
1.0
):
return
"top_p_decay must be less than or equal to 1.0"
top_p_bound
=
0.0
if
"top_p_bound"
in
request
.
get_json
():
top_p_bound
=
request
.
get_json
()[
"top_p_bound"
]
if
not
(
type
(
top_p_bound
)
==
float
):
return
"top_p_bound must be a positive float less than or equal to top_p"
if
top_p
==
0.0
:
return
"top_p_bound cannot be set without top_p"
if
not
(
0.0
<
top_p_bound
<=
top_p
):
return
"top_p_bound must be greater than 0 and less than top_p"
add_BOS
=
False
if
"add_BOS"
in
request
.
get_json
():
add_BOS
=
request
.
get_json
()[
"add_BOS"
]
if
not
isinstance
(
add_BOS
,
bool
):
return
"add_BOS must be a boolean value"
if
any
([
len
(
prompt
)
==
0
for
                prompt in prompts]) and not add_BOS:
            return "Empty prompts require add_BOS=true"

        stop_on_double_eol = False
        if "stop_on_double_eol" in request.get_json():
            stop_on_double_eol = request.get_json()["stop_on_double_eol"]
            if not isinstance(stop_on_double_eol, bool):
                return "stop_on_double_eol must be a boolean value"

        stop_on_eol = False
        if "stop_on_eol" in request.get_json():
            stop_on_eol = request.get_json()["stop_on_eol"]
            if not isinstance(stop_on_eol, bool):
                return "stop_on_eol must be a boolean value"

        prevent_newline_after_colon = False
        if "prevent_newline_after_colon" in request.get_json():
            prevent_newline_after_colon = request.get_json()["prevent_newline_after_colon"]
            if not isinstance(prevent_newline_after_colon, bool):
                return "prevent_newline_after_colon must be a boolean value"

        random_seed = -1
        if "random_seed" in request.get_json():
            random_seed = request.get_json()["random_seed"]
            if not isinstance(random_seed, int):
                return "random_seed must be integer"
            if random_seed < 0:
                return "random_seed must be a positive integer"

        no_log = False
        if "no_log" in request.get_json():
            no_log = request.get_json()["no_log"]
            if not isinstance(no_log, bool):
                return "no_log must be a boolean value"

        beam_width = None
        if "beam_width" in request.get_json():
            beam_width = request.get_json()["beam_width"]
            if not isinstance(beam_width, int):
                return "beam_width must be integer"
            if beam_width < 1:
                return "beam_width must be an integer > 1"
            if len(prompts) > 1:
                return "When doing beam_search, batch size must be 1"

        stop_token = 50256
        if "stop_token" in request.get_json():
            stop_token = request.get_json()["stop_token"]
            if not isinstance(stop_token, int):
                return "stop_token must be an integer"

        length_penalty = 1
        if "length_penalty" in request.get_json():
            length_penalty = request.get_json()["length_penalty"]
            if not isinstance(length_penalty, float):
                return "length_penalty must be a float"

        with lock:  # Need to get lock to keep multiple threads from hitting code

            if not no_log:
                print("request IP: " + str(request.remote_addr))
                print(json.dumps(request.get_json()), flush=True)
                print("start time: ", datetime.datetime.now())

            try:
                if beam_width is not None:
                    MegatronGenerate.send_do_beam_search()  # Tell other ranks we're doing beam_search
                    response, response_seg, response_scores = \
                        beam_search_and_post_process(
                            self.model,
                            prompts=prompts,
                            tokens_to_generate=tokens_to_generate,
                            beam_size=beam_width,
                            add_BOS=add_BOS,
                            stop_token=stop_token,
                            num_return_gen=beam_width,  # Returning whole beam
                            length_penalty=length_penalty,
                            prevent_newline_after_colon=prevent_newline_after_colon
                        )

                    return jsonify({"text": response,
                                    "segments": response_seg,
                                    "scores": response_scores})
                else:
                    MegatronGenerate.send_do_generate()  # Tell other ranks we're doing generate
                    response, response_seg, response_logprobs, _ = \
                        generate_and_post_process(
                            self.model,
                            prompts=prompts,
                            tokens_to_generate=tokens_to_generate,
                            return_output_log_probs=logprobs,
                            top_k_sampling=top_k,
                            top_p_sampling=top_p,
                            top_p_decay=top_p_decay,
                            top_p_bound=top_p_bound,
                            temperature=temperature,
                            add_BOS=add_BOS,
                            use_eod_token_for_early_termination=True,
                            stop_on_double_eol=stop_on_double_eol,
                            stop_on_eol=stop_on_eol,
                            prevent_newline_after_colon=prevent_newline_after_colon,
                            random_seed=random_seed)

                    return jsonify({"text": response,
                                    "segments": response_seg,
                                    "logprobs": response_logprobs})

            except ValueError as ve:
                return ve.args[0]
            print("end time: ", datetime.datetime.now())


class MegatronServer(object):
    def __init__(self, model):
        self.app = Flask(__name__, static_url_path='')
        api = Api(self.app)
        api.add_resource(MegatronGenerate, '/api', resource_class_args=[model])

    def run(self, url, port):
        self.app.run(url, threaded=True, debug=False, port=port)
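For orientation, here is a minimal usage sketch of the server above. It is not part of this commit: the host, port, and request payload are illustrative assumptions, and it assumes the MegatronGenerate resource registered at /api reads its arguments from the JSON body of an HTTP PUT, as the validation code above suggests.

# Hypothetical usage sketch (assumptions noted above).
import requests

# On the rank-0 process, after the model has been built and loaded:
#   server = MegatronServer(model)
#   server.run("0.0.0.0", port=5000)

# From a client, send the prompts and sampling options as JSON:
resp = requests.put(
    "http://localhost:5000/api",
    json={
        "prompts": ["Megatron-LM is"],
        "tokens_to_generate": 32,
        "top_k": 1,
        "no_log": True,
    },
    headers={"Content-Type": "application/json"},
)
print(resp.json()["text"])  # generated continuations, one per prompt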
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/__init__.py
0 → 100644
View file @
bc5c7fa7
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/autoaugment.py
0 → 100644
View file @
bc5c7fa7
"""AutoAugment data augmentation policy for ImageNet.
-- Begin license text.
MIT License
Copyright (c) 2018 Philip Popien
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-- End license text.
Code adapted from https://github.com/DeepVoltaire/AutoAugment.
This module implements the fixed AutoAugment data augmentation policy for ImageNet provided in
Appendix A, Table 9 of reference [1]. It does not include any of the search code for augmentation
policies.
Reference:
[1] https://arxiv.org/abs/1805.09501
"""
import random

import numpy as np
from PIL import Image
from PIL import ImageEnhance
from PIL import ImageOps

_MAX_LEVEL = 10  # Maximum integer strength of an augmentation, if applicable.


class ImageNetPolicy:
    """Definition of an ImageNetPolicy.

    Implements a fixed AutoAugment data augmentation policy targeted at
    ImageNet training by randomly applying at runtime one of the 25 pre-defined
    data augmentation sub-policies provided in Reference [1].

    Usage example as a Pytorch Transform:
    >>> transform=transforms.Compose([transforms.Resize(256),
    >>>                               ImageNetPolicy(),
    >>>                               transforms.ToTensor()])
    """

    def __init__(self, fillcolor=(128, 128, 128)):
        """Initialize an ImageNetPolicy.

        Args:
            fillcolor (tuple): RGB color components of the color to be used for
                filling when needed (default: (128, 128, 128), which
                corresponds to gray).
        """
        # Instantiate a list of sub-policies.
        # Each entry of the list is a SubPolicy which consists of
        # two augmentation operations,
        # each of those parametrized as operation, probability, magnitude.
        # Those two operations are applied sequentially on the image upon call.
        self.policies = [
            SubPolicy("posterize", 0.4, 8, "rotate", 0.6, 9, fillcolor),
            SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor),
            SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor),
            SubPolicy("posterize", 0.6, 7, "posterize", 0.6, 6, fillcolor),
            SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor),
            SubPolicy("equalize", 0.4, 4, "rotate", 0.8, 8, fillcolor),
            SubPolicy("solarize", 0.6, 3, "equalize", 0.6, 7, fillcolor),
            SubPolicy("posterize", 0.8, 5, "equalize", 1.0, 2, fillcolor),
            SubPolicy("rotate", 0.2, 3, "solarize", 0.6, 8, fillcolor),
            SubPolicy("equalize", 0.6, 8, "posterize", 0.4, 6, fillcolor),
            SubPolicy("rotate", 0.8, 8, "color", 0.4, 0, fillcolor),
            SubPolicy("rotate", 0.4, 9, "equalize", 0.6, 2, fillcolor),
            SubPolicy("equalize", 0.0, 7, "equalize", 0.8, 8, fillcolor),
            SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor),
            SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor),
            SubPolicy("rotate", 0.8, 8, "color", 1.0, 2, fillcolor),
            SubPolicy("color", 0.8, 8, "solarize", 0.8, 7, fillcolor),
            SubPolicy("sharpness", 0.4, 7, "invert", 0.6, 8, fillcolor),
            SubPolicy("shearX", 0.6, 5, "equalize", 1.0, 9, fillcolor),
            SubPolicy("color", 0.4, 0, "equalize", 0.6, 3, fillcolor),
            SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor),
            SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor),
            SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor),
            SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor),
            SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor),
        ]

    def __call__(self, img):
        """Define call method for ImageNetPolicy class."""
        policy_idx = random.randint(0, len(self.policies) - 1)
        return self.policies[policy_idx](img)

    def __repr__(self):
        """Define repr method for ImageNetPolicy class."""
        return "ImageNetPolicy"


class SubPolicy:
    """Definition of a SubPolicy.

    A SubPolicy consists of two augmentation operations,
    each of those parametrized as operation, probability, magnitude.
    The two operations are applied sequentially on the image upon call.
    """

    def __init__(
        self,
        operation1,
        probability1,
        magnitude_idx1,
        operation2,
        probability2,
        magnitude_idx2,
        fillcolor,
    ):
        """Initialize a SubPolicy.

        Args:
            operation1 (str): Key specifying the first augmentation operation.
                There are fourteen key values altogether (see supported_ops below
                listing supported operations).
            probability1 (float): Probability within [0., 1.] of applying the
                first augmentation operation.
            magnitude_idx1 (int): Integer specifying the strength of the first
                operation as an index further used to derive the magnitude from a
                range of possible values.
            operation2 (str): Key specifying the second augmentation operation.
            probability2 (float): Probability within [0., 1.] of applying the
                second augmentation operation.
            magnitude_idx2 (int): Integer specifying the strength of the
                second operation as an index further used to derive the magnitude
                from a range of possible values.
            fillcolor (tuple): RGB color components of the color to be used for
                filling.

        Returns:
        """
        # List of supported operations for operation1 and operation2.
        supported_ops = [
            "shearX",
            "shearY",
            "translateX",
            "translateY",
            "rotate",
            "color",
            "posterize",
            "solarize",
            "contrast",
            "sharpness",
            "brightness",
            "autocontrast",
            "equalize",
            "invert",
        ]
        assert (operation1 in supported_ops) and (
            operation2 in supported_ops
        ), "SubPolicy:one of oper1 or oper2 refers to an unsupported operation."

        assert (
            0.0 <= probability1 <= 1.0 and 0.0 <= probability2 <= 1.0
        ), "SubPolicy: prob1 and prob2 should be within [0., 1.]."

        assert (
            isinstance(magnitude_idx1, int) and 0 <= magnitude_idx1 <= 10
        ), "SubPolicy: idx1 should be specified as an integer within [0, 10]."

        assert (
            isinstance(magnitude_idx2, int) and 0 <= magnitude_idx2 <= 10
        ), "SubPolicy: idx2 should be specified as an integer within [0, 10]."

        # Define a dictionary where each key refers to a specific type of
        # augmentation and the corresponding value is a range of ten possible
        # magnitude values for that augmentation.
        num_levels = _MAX_LEVEL + 1
        ranges = {
            "shearX": np.linspace(0, 0.3, num_levels),
            "shearY": np.linspace(0, 0.3, num_levels),
            "translateX": np.linspace(0, 150 / 331, num_levels),
            "translateY": np.linspace(0, 150 / 331, num_levels),
            "rotate": np.linspace(0, 30, num_levels),
            "color": np.linspace(0.0, 0.9, num_levels),
            "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(np.int32),
            "solarize": np.linspace(256, 0, num_levels),  # range [0, 256]
            "contrast": np.linspace(0.0, 0.9, num_levels),
            "sharpness": np.linspace(0.0, 0.9, num_levels),
            "brightness": np.linspace(0.0, 0.9, num_levels),
            "autocontrast": [0] * num_levels,  # This augmentation doesn't use magnitude parameter.
            "equalize": [0] * num_levels,  # This augmentation doesn't use magnitude parameter.
            "invert": [0] * num_levels,  # This augmentation doesn't use magnitude parameter.
        }

        def rotate_with_fill(img, magnitude):
            """Define rotation transformation with fill.

            The input image is first rotated, then it is blended together with
            a gray mask of the same size. Note that fillcolor as defined
            elsewhere in this module doesn't apply here.

            Args:
                magnitude (float): rotation angle in degrees.
            Returns:
                rotated_filled (PIL Image): rotated image with gray filling for
                disoccluded areas unveiled by the rotation.
            """
            rotated = img.convert("RGBA").rotate(magnitude)
            rotated_filled = Image.composite(
                rotated, Image.new("RGBA", rotated.size, (128,) * 4), rotated
            )
            return rotated_filled.convert(img.mode)

        # Define a dictionary of augmentation functions where each key refers
        # to a specific type of augmentation and the corresponding value defines
        # the augmentation itself using a lambda function.
        # pylint: disable=unnecessary-lambda
        func_dict = {
            "shearX": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
                Image.BICUBIC,
                fillcolor=fillcolor,
            ),
            "shearY": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
                Image.BICUBIC,
                fillcolor=fillcolor,
            ),
            "translateX": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
                fillcolor=fillcolor,
            ),
            "translateY": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
                fillcolor=fillcolor,
            ),
            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
                1 + magnitude * random.choice([-1, 1])
            ),
            "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
            "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
            "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
                1 + magnitude * random.choice([-1, 1])),
            "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
                1 + magnitude * random.choice([-1, 1])),
            "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
                1 + magnitude * random.choice([-1, 1])),
            "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
            "equalize": lambda img, magnitude: ImageOps.equalize(img),
            "invert": lambda img, magnitude: ImageOps.invert(img),
        }

        # Store probability, function and magnitude of the first augmentation
        # for the sub-policy.
        self.probability1 = probability1
        self.operation1 = func_dict[operation1]
        self.magnitude1 = ranges[operation1][magnitude_idx1]

        # Store probability, function and magnitude of the second augmentation
        # for the sub-policy.
        self.probability2 = probability2
        self.operation2 = func_dict[operation2]
        self.magnitude2 = ranges[operation2][magnitude_idx2]

    def __call__(self, img):
        """Define call method for SubPolicy class."""
        # Randomly apply operation 1.
        if random.random() < self.probability1:
            img = self.operation1(img, self.magnitude1)

        # Randomly apply operation 2.
        if random.random() < self.probability2:
            img = self.operation2(img, self.magnitude2)
        return img
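A brief sketch of how this policy might be used, matching the docstring above. It is illustrative only: the synthetic image and the import path (taken from the file path in this diff) are assumptions, not part of the commit.

# Hypothetical usage sketch: apply ImageNetPolicy inside a torchvision pipeline.
import numpy as np
from PIL import Image
from torchvision import transforms
from megatron.legacy.data.autoaugment import ImageNetPolicy  # path assumed from this diff

pipeline = transforms.Compose([
    transforms.Resize(256),
    ImageNetPolicy(),          # one of the 25 sub-policies is sampled per call
    transforms.ToTensor(),
])

# A random 300x300 RGB image stands in for a real ImageNet sample.
img = Image.fromarray(np.random.randint(0, 255, (300, 300, 3), dtype=np.uint8))
augmented = pipeline(img)
print(augmented.shape)         # torch.Size([3, 256, 256])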
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/biencoder_dataset_utils.py
0 → 100644
View file @
bc5c7fa7
import os
import time

import numpy as np
import torch

from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.core import mpu, tensor_parallel
from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, \
    pad_and_convert_to_numpy
from megatron.legacy.data.data_samplers import MegatronPretrainingSampler


def make_attention_mask(source_block, target_block):
    """
    Returns a 2-dimensional (2-D) attention mask
    :param source_block: 1-D array
    :param target_block: 1-D array
    """
    mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1)
    mask = mask.astype(np.int64)
    # (source_length, target_length)
    return mask


def get_one_epoch_dataloader(dataset, micro_batch_size=None):
    """Specifically one epoch to be used in an indexing job."""
    args = get_args()

    if micro_batch_size is None:
        micro_batch_size = args.micro_batch_size
    num_workers = args.num_workers

    # Use megatron's sampler with consumed samples set to 0 as
    # this is only for evaluation and don't intend to resume half way.
    # Also, set the drop last to false as don't intend to remove
    # the last batch
    batch_sampler = MegatronPretrainingSampler(
        total_samples=len(dataset),
        consumed_samples=0,
        micro_batch_size=args.micro_batch_size,
        data_parallel_rank=mpu.get_data_parallel_rank(),
        data_parallel_size=mpu.get_data_parallel_world_size(),
        drop_last=False)

    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=num_workers,
                                       pin_memory=True)


def get_ict_batch(data_iterator):
    # Items and their type.
    keys = ['query_tokens', 'query_mask',
            'context_tokens', 'context_mask', 'block_data']
    datatype = torch.int64

    # Broadcast data.
    if data_iterator is None:
        data = None
    else:
        data = next(data_iterator)
    data_b = tensor_parallel.broadcast_data(keys, data, datatype)

    # Unpack.
    query_tokens = data_b['query_tokens'].long()
    query_mask = data_b['query_mask'] < 0.5
    context_tokens = data_b['context_tokens'].long()
    context_mask = data_b['context_mask'] < 0.5
    block_indices = data_b['block_data'].long()

    return query_tokens, query_mask,\
        context_tokens, context_mask, block_indices


def join_str_list(str_list):
    """Join a list of strings, handling spaces appropriately"""
    result = ""
    for s in str_list:
        if s.startswith("##"):
            result += s[2:]
        else:
            result += " " + s
    return result


class BlockSampleData(object):
    """A struct for fully describing a fixed-size block of data as used in REALM

    :param start_idx: for first sentence of the block
    :param end_idx: for last sentence of the block (may be partially truncated in sample construction)
    :param doc_idx: the index of the document from which the block comes in the original indexed dataset
    :param block_idx: a unique integer identifier given to every block.
    """
    def __init__(self, start_idx, end_idx, doc_idx, block_idx):
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.doc_idx = doc_idx
        self.block_idx = block_idx

    def as_array(self):
        return np.array([self.start_idx, self.end_idx,
                         self.doc_idx, self.block_idx]).astype(np.int64)

    def as_tuple(self):
        return self.start_idx, self.end_idx, self.doc_idx, self.block_idx


class BlockSamplesMapping(object):
    def __init__(self, mapping_array):
        # make sure that the array is compatible with BlockSampleData
        assert mapping_array.shape[1] == 4
        self.mapping_array = mapping_array

    def __len__(self):
        return self.mapping_array.shape[0]

    def __getitem__(self, idx):
        """Get the data associated with an indexed sample."""
        sample_data = BlockSampleData(*self.mapping_array[idx])
        return sample_data


def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs,
                              max_num_samples, max_seq_length, seed, name,
                              use_one_sent_docs=False):
    """Get samples mapping for a dataset over fixed size blocks. This function also requires
    a dataset of the titles for the source documents since their lengths must be taken into account.

    :return: samples_mapping (BlockSamplesMapping)
    """

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{}s'.format(seed)
    if use_one_sent_docs:
        indexmap_filename += '_1sentok'
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if mpu.get_data_parallel_rank() == 0 and \
       not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert block_dataset.document_indices.dtype == np.int64
        assert block_dataset.sequence_lengths.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(name))

        from megatron.core.datasets import helpers
        mapping_array = helpers.build_blocks_mapping(
            block_dataset.document_indices,
            block_dataset.sequence_lengths,
            title_dataset.sequence_lengths,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            seed,
            verbose,
            use_one_sent_docs)

        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, mapping_array, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.tensor([1], dtype=torch.long, device='cuda')
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()

    mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    samples_mapping = BlockSamplesMapping(mapping_array)

    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        mapping_array.shape[0]))

    return samples_mapping
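A small worked example of the attention-mask helper above, since the broadcasting is easy to misread. The import path is assumed from the file path shown in this diff.

# Worked example for make_attention_mask: 1 marks real tokens, 0 marks padding.
import numpy as np
from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask

source = np.array([5, 8, 0])   # two real tokens, one pad
target = np.array([7, 0])      # one real token, one pad
mask = make_attention_mask(source, target)
print(mask)
# [[1 0]
#  [1 0]
#  [0 0]]  -> entry (i, j) is 1 only when both source i and target j are non-pad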
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/data_samplers.py
0 → 100644
View file @
bc5c7fa7
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Dataloaders."""
import random
import torch
import numpy as np
from torch.utils.data import Dataset

from megatron.training import get_args
from megatron.core import mpu


def build_pretraining_data_loader(dataset, consumed_samples):
    """Build dataloader given an input dataset."""

    if dataset is None:
        return None
    args = get_args()

    # Megatron sampler
    if args.dataloader_type == 'single':
        batch_sampler = MegatronPretrainingSampler(
            total_samples=len(dataset),
            consumed_samples=consumed_samples,
            micro_batch_size=args.micro_batch_size,
            data_parallel_rank=mpu.get_data_parallel_rank(),
            data_parallel_size=mpu.get_data_parallel_world_size())
    elif args.dataloader_type == 'cyclic':
        batch_sampler = MegatronPretrainingRandomSampler(
            dataset,
            total_samples=len(dataset),
            consumed_samples=consumed_samples,
            micro_batch_size=args.micro_batch_size,
            data_parallel_rank=mpu.get_data_parallel_rank(),
            data_parallel_size=mpu.get_data_parallel_world_size(),
            data_sharding=args.data_sharding)
    elif args.dataloader_type == "external":
        # External dataloaders are passed through. User is expected to provide a
        # torch-compatible dataloader and define samplers, if needed.
        return dataset
    else:
        raise Exception('{} dataloader type is not supported.'.format(
            args.dataloader_type))

    # Torch dataloader.
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=args.num_workers,
                                       pin_memory=True,
                                       persistent_workers=True if args.num_workers > 0 else False,
                                       )


class MegatronPretrainingSampler:

    def __init__(self, total_samples, consumed_samples, micro_batch_size,
                 data_parallel_rank, data_parallel_size, drop_last=True):
        # Keep a copy of input params for later use.
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.micro_batch_size = micro_batch_size
        self.data_parallel_rank = data_parallel_rank
        self.micro_batch_times_data_parallel_size = \
            self.micro_batch_size * data_parallel_size
        self.drop_last = drop_last

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.consumed_samples < self.total_samples, \
            'no samples left to consume: {}, {}'.format(self.consumed_samples,
                                                        self.total_samples)
        assert self.micro_batch_size > 0
        assert data_parallel_size > 0
        assert self.data_parallel_rank < data_parallel_size, \
            'data_parallel_rank should be smaller than data size: {}, ' \
            '{}'.format(self.data_parallel_rank, data_parallel_size)

    def __len__(self):
        return self.total_samples

    def get_start_end_idx(self):
        start_idx = self.data_parallel_rank * self.micro_batch_size
        end_idx = start_idx + self.micro_batch_size
        return start_idx, end_idx

    def __iter__(self):
        batch = []
        # Last batch will be dropped if drop_last is not set False
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            if len(batch) == self.micro_batch_times_data_parallel_size:
                start_idx, end_idx = self.get_start_end_idx()
                yield batch[start_idx:end_idx]
                batch = []

        # Check the last partial batch and see drop_last is set
        if len(batch) > 0 and not self.drop_last:
            start_idx, end_idx = self.get_start_end_idx()
            yield batch[start_idx:end_idx]


class RandomSeedDataset(Dataset):

    def __init__(self, dataset):
        args = get_args()
        self.base_seed = args.seed
        self.curr_seed = args.seed
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def set_epoch(self, epoch):
        self.curr_seed = self.base_seed + epoch

    def __getitem__(self, idx):
        seed = idx + self.curr_seed
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        return self.dataset[idx]


class MegatronPretrainingRandomSampler:

    def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size,
                 data_parallel_rank, data_parallel_size, data_sharding):
        # Keep a copy of input params for later use.
        self.dataset = dataset
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.micro_batch_size = micro_batch_size
        self.data_parallel_rank = data_parallel_rank
        self.data_parallel_size = data_parallel_size
        self.data_sharding = data_sharding
        self.micro_batch_times_data_parallel_size = \
            self.micro_batch_size * data_parallel_size
        self.last_batch_size = \
            self.total_samples % self.micro_batch_times_data_parallel_size

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.micro_batch_size > 0
        assert data_parallel_size > 0
        assert self.data_parallel_rank < data_parallel_size, \
            'data_parallel_rank should be smaller than data size: {}, ' \
            '{}'.format(self.data_parallel_rank, data_parallel_size)

    def __len__(self):
        return self.total_samples

    def __iter__(self):
        active_total_samples = self.total_samples - self.last_batch_size
        self.epoch = self.consumed_samples // active_total_samples
        current_epoch_samples = self.consumed_samples % active_total_samples
        assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0

        if isinstance(self.dataset, RandomSeedDataset):
            self.dataset.set_epoch(self.epoch)

        # data sharding and random sampling
        if self.data_sharding:
            bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \
                           * self.micro_batch_size
            bucket_offset = current_epoch_samples // self.data_parallel_size
            start_idx = self.data_parallel_rank * bucket_size

            g = torch.Generator()
            g.manual_seed(self.epoch)
            random_idx = torch.randperm(bucket_size, generator=g).tolist()
            idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
        else:
            full_bucket_size = (self.total_samples // self.micro_batch_size) \
                                * self.micro_batch_size
            full_bucket_offset = current_epoch_samples
            g = torch.Generator()
            g.manual_seed(self.epoch)
            idx_range_total = \
                torch.randperm(full_bucket_size, generator=g).tolist()
            idx_range_active = idx_range_total[full_bucket_offset:]
            idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size]

        batch = []
        # Last batch if not complete will be dropped.
        for idx in idx_range:
            batch.append(idx)
            if len(batch) == self.micro_batch_size:
                self.consumed_samples += self.micro_batch_times_data_parallel_size
                yield batch
                batch = []
0 → 100644
View file @
bc5c7fa7
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Most of the code here has been copied from:
# https://github.com/google-research/albert/blob/master/create_pretraining_data.py
# with some modifications.
import math
import os
import time
import collections

import numpy as np
import torch

from megatron.training import (
    get_args,
    print_rank_0
)
from megatron.core import mpu
from megatron.core.datasets.indexed_dataset import IndexedDataset

DSET_TYPE_BERT = 'standard_bert'
DSET_TYPE_ICT = 'ict'
DSET_TYPE_T5 = 't5'
DSET_TYPE_MULTIMODAL = 'multimodal'

DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_MULTIMODAL]


def get_datasets_weights_and_num_samples(data_prefix,
                                         train_valid_test_num_samples):

    # The data prefix should be in the format of:
    #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..
    assert len(data_prefix) % 2 == 0
    num_datasets = len(data_prefix) // 2
    weights = [0] * num_datasets
    prefixes = [0] * num_datasets
    for i in range(num_datasets):
        weights[i] = float(data_prefix[2 * i])
        prefixes[i] = (data_prefix[2 * i + 1]).strip()
    # Normalize weights
    weight_sum = 0.0
    for weight in weights:
        weight_sum += weight
    assert weight_sum > 0.0
    weights = [weight / weight_sum for weight in weights]

    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
    # not uniformly distribute the number of samples, we still have
    # samples left to feed to the network.
    if isinstance(train_valid_test_num_samples, list):
        datasets_train_valid_test_num_samples = []
        for weight in weights:
            datasets_train_valid_test_num_samples.append(
                [int(math.ceil(val * weight * 1.005))
                 for val in train_valid_test_num_samples])
    else:
        # Used when separate dataset files are provided for train,
        # valid and test
        datasets_train_valid_test_num_samples = [
            int(math.ceil(train_valid_test_num_samples * weight * 1.005))
            for weight in weights]

    return prefixes, weights, datasets_train_valid_test_num_samples


def get_a_and_b_segments(sample, np_rng):
    """Divide sample into a and b segments."""

    # Number of sentences in the sample.
    n_sentences = len(sample)
    # Make sure we always have two sentences.
    assert n_sentences > 1, 'make sure each sample has at least two sentences.'

    # First part:
    # `a_end` is how many sentences go into the `A`.
    a_end = 1
    if n_sentences >= 3:
        # Note that randint in numpy is exclusive.
        a_end = np_rng.randint(1, n_sentences)
    tokens_a = []
    for j in range(a_end):
        tokens_a.extend(sample[j])

    # Second part:
    tokens_b = []
    for j in range(a_end, n_sentences):
        tokens_b.extend(sample[j])

    # Random next:
    is_next_random = False
    if np_rng.random() < 0.5:
        is_next_random = True
        tokens_a, tokens_b = tokens_b, tokens_a

    return tokens_a, tokens_b, is_next_random


def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
    """Truncates a pair of sequences to a maximum sequence length."""
    #print(len_a, len_b, max_num_tokens)
    assert len_a > 0
    if len_a + len_b <= max_num_tokens:
        return False
    while len_a + len_b > max_num_tokens:
        if len_a > len_b:
            len_a -= 1
            tokens = tokens_a
        else:
            len_b -= 1
            tokens = tokens_b
        if np_rng.random() < 0.5:
            del tokens[0]
        else:
            tokens.pop()
    return True


def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
    """Merge segments A and B, add [CLS] and [SEP] and build tokentypes."""

    tokens = []
    tokentypes = []
    # [CLS].
    tokens.append(cls_id)
    tokentypes.append(0)
    # Segment A.
    for token in tokens_a:
        tokens.append(token)
        tokentypes.append(0)
    # [SEP].
    tokens.append(sep_id)
    tokentypes.append(0)
    # Segment B.
    for token in tokens_b:
        tokens.append(token)
        tokentypes.append(1)
    if tokens_b:
        # [SEP].
        tokens.append(sep_id)
        tokentypes.append(1)

    return tokens, tokentypes


MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def is_start_piece(piece):
    """Check if the current word piece is the starting piece (BERT)."""
    # When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequence
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    return not piece.startswith("##")


def create_masked_lm_predictions(tokens,
                                 vocab_id_list, vocab_id_to_token_dict,
                                 masked_lm_prob,
                                 cls_id, sep_id, mask_id,
                                 max_predictions_per_seq,
                                 np_rng,
                                 max_ngrams=3,
                                 do_whole_word_mask=True,
                                 favor_longer_ngram=False,
                                 do_permutation=False,
                                 geometric_dist=False,
                                 masking_style="bert"):
    """Creates the predictions for the masked LM objective.
    Note: Tokens here are vocab ids and not text tokens."""

    cand_indexes = []
    # Note(mingdachen): We create a list for recording if the piece is
    # the starting piece of current token, where 1 means true, so that
    # on-the-fly whole word masking is possible.
    token_boundary = [0] * len(tokens)

    for (i, token) in enumerate(tokens):
        if token == cls_id or token == sep_id:
            token_boundary[i] = 1
            continue
        # Whole Word Masking means that if we mask all of the wordpieces
        # corresponding to an original word.
        #
        # Note that Whole Word Masking does *not* change the training code
        # at all -- we still predict each WordPiece independently, softmaxed
        # over the entire vocabulary.
        if (do_whole_word_mask and len(cand_indexes) >= 1 and
                not is_start_piece(vocab_id_to_token_dict[token])):
            cand_indexes[-1].append(i)
        else:
            cand_indexes.append([i])
            if is_start_piece(vocab_id_to_token_dict[token]):
                token_boundary[i] = 1

    output_tokens = list(tokens)

    masked_lm_positions = []
    masked_lm_labels = []

    if masked_lm_prob == 0:
        return (output_tokens, masked_lm_positions,
                masked_lm_labels, token_boundary)

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(tokens) * masked_lm_prob))))

    ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
    if not geometric_dist:
        # Note(mingdachen):
        # By default, we set the probabilities to favor shorter ngram sequences.
        pvals = 1. / np.arange(1, max_ngrams + 1)
        pvals /= pvals.sum(keepdims=True)
        if favor_longer_ngram:
            pvals = pvals[::-1]

    ngram_indexes = []
    for idx in range(len(cand_indexes)):
        ngram_index = []
        for n in ngrams:
            ngram_index.append(cand_indexes[idx:idx + n])
        ngram_indexes.append(ngram_index)

    np_rng.shuffle(ngram_indexes)

    (masked_lms, masked_spans) = ([], [])
    covered_indexes = set()
    for cand_index_set in ngram_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if not cand_index_set:
            continue
        # Note(mingdachen):
        # Skip current piece if they are covered in lm masking or previous ngrams.
        for index_set in cand_index_set[0]:
            for index in index_set:
                if index in covered_indexes:
                    continue

        if not geometric_dist:
            n = np_rng.choice(ngrams[:len(cand_index_set)],
                              p=pvals[:len(cand_index_set)] /
                              pvals[:len(cand_index_set)].sum(keepdims=True))
        else:
            # Sampling "n" from the geometric distribution and clipping it to
            # the max_ngrams. Using p=0.2 default from the SpanBERT paper
            # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1)
            n = min(np_rng.geometric(0.2), max_ngrams)

        index_set = sum(cand_index_set[n - 1], [])
        n -= 1
        # Note(mingdachen):
        # Repeatedly looking for a candidate that does not exceed the
        # maximum number of predictions by trying shorter ngrams.
        while len(masked_lms) + len(index_set) > num_to_predict:
            if n == 0:
                break
            index_set = sum(cand_index_set[n - 1], [])
            n -= 1
        # If adding a whole-word mask would exceed the maximum number of
        # predictions, then just skip this candidate.
        if len(masked_lms) + len(index_set) > num_to_predict:
            continue
        is_any_index_covered = False
        for index in index_set:
            if index in covered_indexes:
                is_any_index_covered = True
                break
        if is_any_index_covered:
            continue
        for index in index_set:
            covered_indexes.add(index)
            masked_token = None
            if masking_style == "bert":
                # 80% of the time, replace with [MASK]
                if np_rng.random() < 0.8:
                    masked_token = mask_id
                else:
                    # 10% of the time, keep original
                    if np_rng.random() < 0.5:
                        masked_token = tokens[index]
                    # 10% of the time, replace with random word
                    else:
                        masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))]
            elif masking_style == "t5":
                masked_token = mask_id
            else:
                raise ValueError("invalid value of masking style")

            output_tokens[index] = masked_token
            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))

        masked_spans.append(MaskedLmInstance(
            index=index_set,
            label=[tokens[index] for index in index_set]))

    assert len(masked_lms) <= num_to_predict
    np_rng.shuffle(ngram_indexes)

    select_indexes = set()
    if do_permutation:
        for cand_index_set in ngram_indexes:
            if len(select_indexes) >= num_to_predict:
                break
            if not cand_index_set:
                continue
            # Note(mingdachen):
            # Skip current piece if they are covered in lm masking or previous ngrams.
            for index_set in cand_index_set[0]:
                for index in index_set:
                    if index in covered_indexes or index in select_indexes:
                        continue

            n = np.random.choice(ngrams[:len(cand_index_set)],
                                 p=pvals[:len(cand_index_set)] /
                                 pvals[:len(cand_index_set)].sum(keepdims=True))
            index_set = sum(cand_index_set[n - 1], [])
            n -= 1

            while len(select_indexes) + len(index_set) > num_to_predict:
                if n == 0:
                    break
                index_set = sum(cand_index_set[n - 1], [])
                n -= 1
            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if len(select_indexes) + len(index_set) > num_to_predict:
                continue
            is_any_index_covered = False
            for index in index_set:
                if index in covered_indexes or index in select_indexes:
                    is_any_index_covered = True
                    break
            if is_any_index_covered:
                continue
            for index in index_set:
                select_indexes.add(index)
        assert len(select_indexes) <= num_to_predict

        select_indexes = sorted(select_indexes)
        permute_indexes = list(select_indexes)
        np_rng.shuffle(permute_indexes)
        orig_token = list(output_tokens)

        for src_i, tgt_i in zip(select_indexes, permute_indexes):
            output_tokens[src_i] = orig_token[tgt_i]
            masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)
    # Sort the spans by the index of the first span
    masked_spans = sorted(masked_spans, key=lambda x: x.index[0])

    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)
    return (output_tokens, masked_lm_positions, masked_lm_labels,
            token_boundary, masked_spans)


def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                             masked_labels, pad_id, max_seq_length):
    """Pad sequences and convert them to numpy."""

    # Some checks.
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    assert len(tokentypes) == num_tokens
    assert len(masked_positions) == len(masked_labels)

    # Tokens and token types.
    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

    # Padding mask.
    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
                               dtype=np.int64)

    # Labels and loss mask.
    labels = [-1] * max_seq_length
    loss_mask = [0] * max_seq_length
    for i in range(len(masked_positions)):
        assert masked_positions[i] < num_tokens
        labels[masked_positions[i]] = masked_labels[i]
        loss_mask[masked_positions[i]] = 1

    labels_np = np.array(labels, dtype=np.int64)
    loss_mask_np = np.array(loss_mask, dtype=np.int64)

    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np


def build_train_valid_test_datasets_with_prefixes(train_valid_test_num_samples,
                                                  max_seq_length,
                                                  seed,
                                                  train_data_prefix=None,
                                                  valid_data_prefix=None,
                                                  test_data_prefix=None,
                                                  binary_head=False,
                                                  max_seq_length_dec=None,
                                                  dataset_type='standard_bert'):
    print_rank_0("Separate data paths provided for train, valid & test.")

    train_dataset, valid_dataset, test_dataset = None, None, None

    # Single dataset.
    if train_data_prefix is not None:
        train_dataset = build_dataset("train", train_data_prefix,
                                      train_valid_test_num_samples[0],
                                      max_seq_length, seed, binary_head,
                                      max_seq_length_dec,
                                      dataset_type=dataset_type)

    if valid_data_prefix is not None:
        valid_dataset = build_dataset("valid", valid_data_prefix,
                                      train_valid_test_num_samples[1],
                                      max_seq_length, seed, False,
                                      binary_head, max_seq_length_dec,
                                      dataset_type=dataset_type)

    if test_data_prefix is not None:
        test_dataset = build_dataset("test", test_data_prefix,
                                     train_valid_test_num_samples[2],
                                     max_seq_length, seed, False,
                                     binary_head, max_seq_length_dec,
                                     dataset_type=dataset_type)

    return (train_dataset, valid_dataset, test_dataset)


def build_train_valid_test_datasets(data_prefix, splits_string,
                                    train_valid_test_num_samples,
                                    max_seq_length, seed,
                                    binary_head=False,
                                    max_seq_length_dec=None,
                                    dataset_type='standard_bert'):

    if len(data_prefix) == 1:
        return _build_train_valid_test_datasets(data_prefix[0],
                                                splits_string,
                                                train_valid_test_num_samples,
                                                max_seq_length, seed,
                                                binary_head,
                                                max_seq_length_dec,
                                                dataset_type=dataset_type)

    raise NotImplementedError("Blending currently unsupported for non-GPT dataset instances")


def _build_train_valid_test_datasets(data_prefix, splits_string,
                                     train_valid_test_num_samples,
                                     max_seq_length, seed,
                                     binary_head,
                                     max_seq_length_dec,
                                     dataset_type='standard_bert'):

    # Indexed dataset.
    indexed_dataset = get_indexed_dataset_(data_prefix, dataset_type)

    # Get start and end indices of train/valid/test into doc-idx
    # Note that doc-idx is designed to be num-docs + 1 so we can
    # easily iterate over it.
    total_num_of_documents = indexed_dataset.document_indices.shape[0] - 1
    splits = get_train_valid_test_split_(splits_string, total_num_of_documents)

    # Print stats about the splits.
    print_rank_0(' > dataset split:')

    def print_split_stats(name, index):
        print_rank_0('    {}:'.format(name))
        print_rank_0('     document indices in [{}, {}) total of {} '
                     'documents'.format(splits[index], splits[index + 1],
                                        splits[index + 1] - splits[index]))
        start_index = indexed_dataset.document_indices[splits[index]]
        end_index = indexed_dataset.document_indices[splits[index + 1]]
        print_rank_0('     sentence indices in [{}, {}) total of {} '
                     'sentences'.format(start_index, end_index,
                                        end_index - start_index))
    print_split_stats('train', 0)
    print_split_stats('validation', 1)
    print_split_stats('test', 2)

    def build_split_dataset(index, name):
        dataset = None
        if splits[index + 1] > splits[index]:
            # Get the pointer to the original doc-idx so we can set it later.
            doc_idx_ptr = indexed_dataset.get_document_indices()
            # Slice the doc-idx
            start_index = splits[index]
            # Add +1 so we can index into the dataset to get the upper bound.
            end_index = splits[index + 1] + 1
            # New doc_idx view.
            indexed_dataset.set_document_indices(doc_idx_ptr[start_index:end_index])

            dataset = build_dataset(
                name, data_prefix,
                train_valid_test_num_samples[index], max_seq_length,
                seed, binary_head, max_seq_length_dec,
                dataset_type, indexed_dataset)

            # Set the original pointer so dataset remains the main dataset.
            indexed_dataset.set_document_indices(doc_idx_ptr)
            # Checks.
            assert indexed_dataset.document_indices[0] == 0
            assert indexed_dataset.document_indices.shape[0] == \
                (total_num_of_documents + 1)
        return dataset

    train_dataset = build_split_dataset(0, 'train')
    valid_dataset = build_split_dataset(1, 'valid')
    test_dataset = build_split_dataset(2, 'test')

    return (train_dataset, valid_dataset, test_dataset)


def build_dataset(name, data_prefix, max_num_samples, max_seq_length,
                  seed, binary_head, max_seq_length_dec,
                  dataset_type='standard_bert', indexed_dataset=None):

    from megatron.legacy.data.ict_dataset import ICTDataset
    from megatron.legacy.data.multimodal_dataset import MultiModalDataset

    if dataset_type == DSET_TYPE_BERT or dataset_type == DSET_TYPE_T5:
        raise ValueError("The Megatron-LM BERT and T5 datasets are deprecated.")

    if dataset_type not in DSET_TYPES:
        raise ValueError("Invalid dataset_type: ", dataset_type)

    if indexed_dataset is None:
        indexed_dataset = get_indexed_dataset_(data_prefix, dataset_type)

    kwargs = dict(
        name=name,
        data_prefix=data_prefix,
        num_epochs=None,
        max_num_samples=max_num_samples,
        max_seq_length=max_seq_length,
        seed=seed,
    )

    if dataset_type == DSET_TYPE_ICT:
        args = get_args()

        title_dataset = get_indexed_dataset_(
            args.titles_data_path,
            dataset_type)

        dataset = ICTDataset(
            block_dataset=indexed_dataset,
            title_dataset=title_dataset,
            query_in_block_prob=args.query_in_block_prob,
            use_one_sent_docs=args.use_one_sent_docs,
            binary_head=binary_head,
            **kwargs
        )
    elif dataset_type == DSET_TYPE_MULTIMODAL:
        args = get_args()
        dataset = MultiModalDataset(name=name,
                                    data_prefix=data_prefix,
                                    indexed_dataset=indexed_dataset,
                                    num_samples=max_num_samples,
                                    seq_length=max_seq_length,
                                    seed=seed,
                                    img_h=args.img_h,
                                    img_w=args.img_w,
                                    )
    else:
        raise NotImplementedError("Dataset type not fully implemented.")

    return dataset


def get_indexed_dataset_(data_prefix, dataset_type):

    print_rank_0(' > building dataset index ...')

    start_time = time.time()
    multimodal = dataset_type == DSET_TYPE_MULTIMODAL
    indexed_dataset = IndexedDataset(data_prefix, multimodal)
    assert indexed_dataset.sequence_lengths.shape[0] == indexed_dataset.document_indices[-1]
    print_rank_0(' > finished creating indexed dataset in {:4f} '
                 'seconds'.format(time.time() - start_time))

    print_rank_0(' > indexed dataset stats:')
    print_rank_0('    number of documents: {}'.format(
        indexed_dataset.document_indices.shape[0] - 1))
    print_rank_0('    number of sentences: {}'.format(
        indexed_dataset.sequence_lengths.shape[0]))

    return indexed_dataset


def get_train_valid_test_split_(splits_string, size):
    """ Get dataset splits from comma or '/' separated string list."""

    splits = []
    if splits_string.find(',') != -1:
        splits = [float(s) for s in splits_string.split(',')]
    elif splits_string.find('/') != -1:
        splits = [float(s) for s in splits_string.split('/')]
    else:
        splits = [float(splits_string)]
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    splits_sum = sum(splits)
    assert splits_sum > 0.0
    splits = [split / splits_sum for split in splits]
    splits_index = [0]
    for index, split in enumerate(splits):
        splits_index.append(splits_index[index] +
                            int(round(split * float(size))))
    diff = splits_index[-1] - size
    for index in range(1, len(splits_index)):
        splits_index[index] -= diff
    assert len(splits_index) == 4
    assert splits_index[-1] == size
    return splits_index


def get_samples_mapping(indexed_dataset,
                        data_prefix,
                        num_epochs,
                        max_num_samples,
                        max_seq_length,
                        short_seq_prob,
                        seed,
                        name,
                        binary_head):
    """Get a list that maps a sample index to a starting sentence index, end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0 and \
       not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.document_indices.dtype == np.int64
        assert indexed_dataset.sequence_lengths.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.
        from megatron.core.datasets import helpers
        samples_mapping = helpers.build_mapping(
            indexed_dataset.document_indices,
            indexed_dataset.sequence_lengths,
            num_epochs,
            max_num_samples,
            max_seq_length,
            short_seq_prob,
            seed,
            verbose,
            2 if binary_head else 1)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))
    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.tensor([1], dtype=torch.long, device='cuda')
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() //
        torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(
        indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
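A quick worked example of get_train_valid_test_split_ above, since the normalization and rounding are the non-obvious parts. The import path is assumed from the file path in this diff; the split strings are illustrative.

# Worked example: split strings map to cumulative document boundaries
# for train / validation / test, always ending exactly at `size`.
from megatron.legacy.data.dataset_utils import get_train_valid_test_split_

print(get_train_valid_test_split_("949,50,1", 1000))
# [0, 949, 999, 1000]
print(get_train_valid_test_split_("8,1,1", 20))
# weights normalized to 0.8/0.1/0.1 of 20 documents -> [0, 16, 18, 20]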
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/ict_dataset.py
0 → 100644
View file @
bc5c7fa7
import itertools
import random

import numpy as np
from torch.utils.data import Dataset

from megatron.training import get_tokenizer
from megatron.training import get_args
from megatron.legacy.data.dataset_utils import get_indexed_dataset_
from megatron.legacy.data.realm_dataset_utils import get_block_samples_mapping


def make_attention_mask(source_block, target_block):
    """
    Returns a 2-dimensional (2-D) attention mask
    :param source_block: 1-D array
    :param target_block: 1-D array
    """
    mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1)
    mask = mask.astype(np.int64)
    # (source_length, target_length)
    return mask


def get_ict_dataset(use_titles=True, query_in_block_prob=1):
    """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block())
    rather than for training, since it is only built with a single epoch sample mapping.
    """
    args = get_args()
    block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True)
    titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True)

    kwargs = dict(
        name='full',
        block_dataset=block_dataset,
        title_dataset=titles_dataset,
        data_prefix=args.data_path,
        num_epochs=1,
        max_num_samples=None,
        max_seq_length=args.seq_length,
        seed=1,
        query_in_block_prob=query_in_block_prob,
        use_titles=use_titles,
        use_one_sent_docs=args.use_one_sent_docs
    )
    dataset = ICTDataset(**kwargs)
    return dataset


class ICTDataset(Dataset):
    """Dataset containing sentences and their blocks for an inverse cloze task."""
    def __init__(self, name, block_dataset, title_dataset, data_prefix,
                 num_epochs, max_num_samples, max_seq_length, query_in_block_prob,
                 seed, use_titles=True, use_one_sent_docs=False, binary_head=False):
        self.name = name
        self.seed = seed
        self.max_seq_length = max_seq_length
        self.query_in_block_prob = query_in_block_prob
        self.block_dataset = block_dataset
        self.title_dataset = title_dataset
        self.rng = random.Random(self.seed)
        self.use_titles = use_titles
        self.use_one_sent_docs = use_one_sent_docs

        self.samples_mapping = get_block_samples_mapping(
            block_dataset, title_dataset, data_prefix, num_epochs,
            max_num_samples, max_seq_length, seed, name, use_one_sent_docs)
        self.tokenizer = get_tokenizer()
        self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_list = self.tokenizer.inv_vocab
        self.cls_id = self.tokenizer.cls
        self.sep_id = self.tokenizer.sep
        self.mask_id = self.tokenizer.mask
        self.pad_id = self.tokenizer.pad

    def __len__(self):
        return len(self.samples_mapping)

    def __getitem__(self, idx):
        """Get an ICT example of a pseudo-query and the block of text from which it was extracted"""
        sample_data = self.samples_mapping[idx]
        start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple()

        if self.use_titles:
            title = self.title_dataset[int(doc_idx)]
            title_pad_offset = 3 + len(title)
        else:
            title = None
            title_pad_offset = 2
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
        assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1

        # randint() is inclusive for Python rng
        rand_sent_idx = self.rng.randint(0, len(block) - 1)

        # keep the query in the context query_in_block_prob fraction of the time.
        if self.rng.random() < self.query_in_block_prob:
            query = block[rand_sent_idx].copy()
        else:
            query = block.pop(rand_sent_idx)

        # still need to truncate because blocks are concluded when
        # the sentence lengths have exceeded max_seq_length.
        query = query[:self.max_seq_length - 2]
        block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset]

        query_tokens, query_pad_mask = self.concat_and_pad_tokens(query)
        context_tokens, context_pad_mask = self.concat_and_pad_tokens(block, title)

        query_mask = make_attention_mask(query_tokens, query_tokens)
        context_mask = make_attention_mask(context_tokens, context_tokens)

        block_data = sample_data.as_array()

        sample = {
            'query_tokens': query_tokens,
            'query_mask': query_mask,
            'query_pad_mask': query_pad_mask,
            'context_tokens': context_tokens,
            'context_mask': context_mask,
            'context_pad_mask': context_pad_mask,
            'block_data': block_data,
        }

        return sample

    def get_block(self, start_idx, end_idx, doc_idx):
        """Get the IDs for an evidence block plus the title of the corresponding document"""
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
        title = self.title_dataset[int(doc_idx)]

        block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))]
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)

        return block_tokens, block_pad_mask

    def get_null_block(self):
        """Get empty block and title - used in REALM pretraining"""
        block, title = [], []
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)

        return block_tokens, block_pad_mask

    def concat_and_pad_tokens(self, tokens, title=None):
        """Concat with special tokens and pad sequence to self.max_seq_length"""
        tokens = list(tokens)
        if title is None:
            tokens = [self.cls_id] + tokens + [self.sep_id]
        else:
            title = list(title)
            tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id]
        assert len(tokens) <= self.max_seq_length

        num_pad = self.max_seq_length - len(tokens)
        pad_mask = [1] * len(tokens) + [0] * num_pad
        tokens += [self.pad_id] * num_pad

        return np.array(tokens), np.array(pad_mask)
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/image_folder.py
0 → 100644
View file @
bc5c7fa7
# BSD 3-Clause License
#
# Copyright (c) Soumith Chintala 2016,
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# code taken from
# https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py
# added support for classes_fraction and data_per_class_fraction
from torchvision.datasets import VisionDataset
from PIL import Image

import os
import os.path
from typing import Any, Callable, cast, Dict, List, Optional, Tuple

import numpy as np
def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool:
    """Checks if a file is an allowed extension.

    Args:
        filename (string): path to a file
        extensions (tuple of strings): extensions to consider (lowercase)

    Returns:
        bool: True if the filename ends with one of given extensions
    """
    return filename.lower().endswith(extensions)


def is_image_file(filename: str) -> bool:
    """Checks if a file is an allowed image extension.

    Args:
        filename (string): path to a file

    Returns:
        bool: True if the filename ends with a known image extension
    """
    return has_file_allowed_extension(filename, IMG_EXTENSIONS)


def make_dataset(
    directory: str,
    class_to_idx: Dict[str, int],
    data_per_class_fraction: float,
    extensions: Optional[Tuple[str, ...]] = None,
    is_valid_file: Optional[Callable[[str], bool]] = None,
) -> List[Tuple[str, int]]:
    """Generates a list of samples of a form (path_to_sample, class).

    Args:
        directory (str): root dataset directory
        class_to_idx (Dict[str, int]): dictionary mapping class name to class index
        extensions (optional): A list of allowed extensions.
            Either extensions or is_valid_file should be passed. Defaults to None.
        is_valid_file (optional): A function that takes path of a file
            and checks if the file is a valid file
            (used to check of corrupt files) both extensions and
            is_valid_file should not be passed. Defaults to None.

    Raises:
        ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None.

    Returns:
        List[Tuple[str, int]]: samples of a form (path_to_sample, class)
    """
    instances = []
    directory = os.path.expanduser(directory)
    both_none = extensions is None and is_valid_file is None
    both_something = extensions is not None and is_valid_file is not None
    if both_none or both_something:
        raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time")
    if extensions is not None:
        def is_valid_file(x: str) -> bool:
            return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions))
    is_valid_file = cast(Callable[[str], bool], is_valid_file)
    for target_class in sorted(class_to_idx.keys()):
        class_index = class_to_idx[target_class]
        target_dir = os.path.join(directory, target_class)
        if not os.path.isdir(target_dir):
            continue
        local_instances = []
        for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
            for fname in sorted(fnames):
                path = os.path.join(root, fname)
                if is_valid_file(path):
                    item = path, class_index
                    local_instances.append(item)

        instances.extend(local_instances[0:int(len(local_instances) * data_per_class_fraction)])

    return instances
class DatasetFolder(VisionDataset):
    """A generic data loader where the samples are arranged in this way: ::

        root/class_x/xxx.ext
        root/class_x/xxy.ext
        root/class_x/[...]/xxz.ext

        root/class_y/123.ext
        root/class_y/nsdf3.ext
        root/class_y/[...]/asd932_.ext

    Args:
        root (string): Root directory path.
        loader (callable): A function to load a sample given its path.
        extensions (tuple[string]): A list of allowed extensions.
            both extensions and is_valid_file should not be passed.
        transform (callable, optional): A function/transform that takes in
            a sample and returns a transformed version.
            E.g, ``transforms.RandomCrop`` for images.
        target_transform (callable, optional): A function/transform that takes
            in the target and transforms it.
        is_valid_file (callable, optional): A function that takes path of a file
            and check if the file is a valid file (used to check of corrupt files)
            both extensions and is_valid_file should not be passed.

    Attributes:
        classes (list): List of the class names sorted alphabetically.
        class_to_idx (dict): Dict with items (class_name, class_index).
        samples (list): List of (sample path, class_index) tuples
        targets (list): The class_index value for each image in the dataset
    """

    def __init__(
            self,
            root: str,
            loader: Callable[[str], Any],
            extensions: Optional[Tuple[str, ...]] = None,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            classes_fraction=1.0,
            data_per_class_fraction=1.0,
            is_valid_file: Optional[Callable[[str], bool]] = None,
    ) -> None:
        super(DatasetFolder, self).__init__(root, transform=transform,
                                            target_transform=target_transform)
        self.classes_fraction = classes_fraction
        self.data_per_class_fraction = data_per_class_fraction
        classes, class_to_idx = self._find_classes(self.root)
        samples = self.make_dataset(self.root,
                                    class_to_idx,
                                    self.data_per_class_fraction,
                                    extensions,
                                    is_valid_file)
        if len(samples) == 0:
            msg = "Found 0 files in subfolders of: {}\n".format(self.root)
            if extensions is not None:
                msg += "Supported extensions are: {}".format(",".join(extensions))
            raise RuntimeError(msg)

        self.loader = loader
        self.extensions = extensions
        self.total = len(samples)
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.samples = samples
        self.targets = [s[1] for s in samples]

    @staticmethod
    def make_dataset(
        directory: str,
        class_to_idx: Dict[str, int],
        data_per_class_fraction: float,
        extensions: Optional[Tuple[str, ...]] = None,
        is_valid_file: Optional[Callable[[str], bool]] = None,
    ) -> List[Tuple[str, int]]:
        return make_dataset(directory,
                            class_to_idx,
                            data_per_class_fraction,
                            extensions=extensions,
                            is_valid_file=is_valid_file)

    def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
        """
        Finds the class folders in a dataset.

        Args:
            dir (string): Root directory path.

        Returns:
            tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary.

        Ensures:
            No class is a subdirectory of another.
        """
        all_classes = [d.name for d in os.scandir(dir) if d.is_dir()]
        classes = all_classes[0:int(len(all_classes) * self.classes_fraction)]
        classes.sort()
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        """
        curr_index = index
        for x in range(self.total):
            try:
                path, target = self.samples[curr_index]
                sample = self.loader(path)
                break
            except Exception as e:
                curr_index = np.random.randint(0, self.total)

        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return sample, target

    def __len__(self) -> int:
        return len(self.samples)


IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')


def pil_loader(path: str) -> Image.Image:
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


# TODO: specify the return type
def accimage_loader(path: str) -> Any:
    import accimage
    try:
        return accimage.Image(path)
    except IOError:
        # Potentially a decoding problem, fall back to PIL.Image
        return pil_loader(path)


def default_loader(path: str) -> Any:
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)


class ImageFolder(DatasetFolder):
    """A generic data loader where the images are arranged in this way: ::

        root/dog/xxx.png
        root/dog/xxy.png
        root/dog/[...]/xxz.png

        root/cat/123.png
        root/cat/nsdf3.png
        root/cat/[...]/asd932_.png

    Args:
        root (string): Root directory path.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        loader (callable, optional): A function to load an image given its path.
        is_valid_file (callable, optional): A function that takes path of an Image file
            and check if the file is a valid file (used to check of corrupt files)

    Attributes:
        classes (list): List of the class names sorted alphabetically.
        class_to_idx (dict): Dict with items (class_name, class_index).
        imgs (list): List of (image path, class_index) tuples
    """

    def __init__(
            self,
            root: str,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            classes_fraction=1.0,
            data_per_class_fraction=1.0,
            loader: Callable[[str], Any] = default_loader,
            is_valid_file: Optional[Callable[[str], bool]] = None,
    ):
        super(ImageFolder, self).__init__(root,
                                          loader,
                                          IMG_EXTENSIONS if is_valid_file is None else None,
                                          transform=transform,
                                          target_transform=target_transform,
                                          classes_fraction=classes_fraction,
                                          data_per_class_fraction=data_per_class_fraction,
                                          is_valid_file=is_valid_file)
        self.imgs = self.samples
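Not part of the commit: a usage sketch of the classes_fraction / data_per_class_fraction additions this file documents. The root path is a placeholder; any class-per-subfolder image layout works.

from megatron.legacy.data.image_folder import ImageFolder

dataset = ImageFolder(
    root="./imagenet/train",       # placeholder path
    classes_fraction=0.5,          # keep only the first half of the (sorted) class folders
    data_per_class_fraction=0.1,   # keep 10% of the files within each kept class
)
print(len(dataset), "samples from", len(dataset.classes), "classes")
img, label = dataset[0]            # PIL image (no transform given) and its class index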
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/multimodal_dataset.py
0 → 100644
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from PIL import Image, UnidentifiedImageError
import numpy as np
import io
import torch

try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC

from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize


def _convert_image_to_rgb(image):
    return image.convert("RGB")


def _transform(img_h, img_w):
    return Compose([
        ToPILImage(),
        RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])


class MultiModalDataset(torch.utils.data.Dataset):

    def __init__(self, name, data_prefix, indexed_dataset,
                 num_samples, seq_length, seed, img_h, img_w):

        self.name = name
        self.indexed_dataset = indexed_dataset
        self.doc_idx = indexed_dataset.get_document_indices()
        self.visual_transform = _transform(img_h, img_w)

    def __len__(self):
        return self.indexed_dataset.sequence_lengths.shape[0]

    def __getitem__(self, idx):
        text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx])
        assert mode == 0
        img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx] + 1)
        assert mode == 1

        img_pad = img_sample[0].item()
        xs = img_sample[1:].tobytes(order='C')
        xs = xs[:len(xs) - img_pad]

        img_sample = np.array(Image.open(io.BytesIO(xs)))
        img_sample = self.visual_transform(img_sample).reshape(-1)

        return {'text': np.array(text_sample, dtype=np.int64),
                'img': np.array(img_sample, dtype=np.float32)}
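Not part of the commit: a short sketch of what the _transform pipeline above produces, using a synthetic HWC uint8 array as a stand-in for a decoded image.

import numpy as np
from megatron.legacy.data.multimodal_dataset import _transform

transform = _transform(img_h=224, img_w=224)
dummy = np.random.randint(0, 255, size=(256, 256, 3), dtype=np.uint8)  # stand-in image
tensor = transform(dummy)           # CHW float tensor, CLIP-style normalized
print(tensor.shape)                 # torch.Size([3, 224, 224])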
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/orqa_wiki_dataset.py
0 → 100644
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Wikipedia dataset from DPR code for ORQA."""
from abc import ABC
import csv
import numpy as np
import random

import torch
from torch.utils.data import Dataset

from megatron.training import print_rank_0, get_args, get_tokenizer
from megatron.core import tensor_parallel
from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask


def get_open_retrieval_wiki_dataset():
    args = get_args()
    tokenizer = get_tokenizer()

    dataset = OpenRetrievalEvidenceDataset('2018 Wikipedia from DPR codebase',
                                           'evidence',
                                           args.evidence_data_path,
                                           tokenizer,
                                           args.retriever_seq_length)
    return dataset


def get_open_retrieval_batch(data_iterator):
    # Items and their type.
    keys = ['row_id', 'context', 'context_mask', 'context_types',
            'context_pad_mask']
    datatype = torch.int64

    # Broadcast data.
    data = None if data_iterator is None else next(data_iterator)
    data_b = tensor_parallel.broadcast_data(keys, data, datatype)

    # Unpack.
    row_id = data_b['row_id'].long()
    context = data_b['context'].long()

    # TODO: make the context mask a binary one
    context_mask = (data_b['context_mask'] < 0.5)

    context_types = data_b['context_types'].long()
    context_pad_mask = data_b['context_pad_mask'].long()

    return row_id, context, context_mask, context_types, context_pad_mask


def build_tokens_types_paddings_from_text(row, tokenizer, max_seq_length):
    """Build token types and paddings, trim if needed, and pad if needed."""

    title_ids = tokenizer.tokenize(row['title'])
    context_ids = tokenizer.tokenize(row['text'])

    # Appending the title of the context at front
    extended_context_ids = title_ids + [tokenizer.sep_id] + context_ids

    context_ids, context_types, context_pad_mask = \
        build_tokens_types_paddings_from_ids(extended_context_ids,
                                             max_seq_length,
                                             tokenizer.cls,
                                             tokenizer.sep,
                                             tokenizer.pad)

    return context_ids, context_types, context_pad_mask


# noinspection DuplicatedCode
def build_tokens_types_paddings_from_ids(text_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token types and paddings, trim if needed, and pad if needed."""
    enc_ids = []
    tokentypes_enc = []

    # [CLS].
    enc_ids.append(cls_id)
    tokentypes_enc.append(0)

    # A.
    len_src = len(text_ids)
    enc_ids.extend(text_ids)
    tokentypes_enc.extend([0] * len_src)

    # Cap the size.
    if len(enc_ids) > max_seq_length - 1:
        enc_ids = enc_ids[0: max_seq_length - 1]
        tokentypes_enc = tokentypes_enc[0: max_seq_length - 1]

    # [SEP].
    enc_ids.append(sep_id)
    tokentypes_enc.append(0)
    num_tokens_enc = len(enc_ids)

    # Padding.
    padding_length = max_seq_length - len(enc_ids)
    if padding_length > 0:
        enc_ids.extend([pad_id] * padding_length)
        tokentypes_enc.extend([pad_id] * padding_length)

    pad_mask = ([1] * num_tokens_enc) + ([0] * padding_length)
    pad_mask = np.array(pad_mask, dtype=np.int64)

    return enc_ids, tokentypes_enc, pad_mask


def build_sample(row_id, context_ids, context_types, context_pad_mask):
    """Convert to numpy and return a sample consumed by the batch producer."""

    context_ids = np.array(context_ids, dtype=np.int64)
    context_types = np.array(context_types, dtype=np.int64)
    context_mask = make_attention_mask(context_ids, context_ids)

    sample = ({
        'row_id': row_id,
        'context': context_ids,
        'context_mask': context_mask,
        'context_types': context_types,
        'context_pad_mask': context_pad_mask
    })
    return sample


class OpenRetrievalEvidenceDataset(ABC, Dataset):
    """Open Retrieval Evidence dataset class."""

    def __init__(self, task_name, dataset_name, datapath, tokenizer,
                 max_seq_length):
        # Store inputs.
        self.task_name = task_name
        self.dataset_name = dataset_name
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
                                                             self.dataset_name))
        # Process the files.
        print_rank_0(datapath)
        self.samples, self.id2text = self.process_samples_from_single_path(datapath)

        args = get_args()
        if args.sample_rate < 1:  # subsample
            k = int(len(self.samples) * args.sample_rate)
            self.samples = random.sample(self.samples, k)

        print_rank_0('  >> total number of samples: {}'.format(len(self.samples)))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        row = self.samples[idx]

        context_ids, context_types, context_pad_mask = \
            build_tokens_types_paddings_from_text(row, self.tokenizer,
                                                  self.max_seq_length)

        sample = build_sample(row['doc_id'],
                              context_ids,
                              context_types,
                              context_pad_mask)
        return sample

    @staticmethod
    def process_samples_from_single_path(filename):
        print_rank_0(' > Processing {} ...'.format(filename))
        total = 0

        rows = []
        id2text = {}

        with open(filename) as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            next(reader, None)  # skip the headers
            for row in reader:
                # file format: doc_id, doc_text, title
                doc_id = int(row[0])
                text = row[1]
                title = row[2]

                rows.append({'doc_id': doc_id,
                             'text': text,
                             'title': title})

                assert doc_id not in id2text
                id2text[doc_id] = (text, title)

                total += 1
                if total % 100000 == 0:
                    print_rank_0('  > processed {} rows so far ...'.format(total))

        print_rank_0(' >> processed {} samples.'.format(len(rows)))
        return rows, id2text
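Not part of the commit: a quick illustration of the trim/pad behaviour of build_tokens_types_paddings_from_ids above, with toy ids (101/102/0 as cls/sep/pad are illustrative assumptions):

from megatron.legacy.data.orqa_wiki_dataset import build_tokens_types_paddings_from_ids

ids, types, pad_mask = build_tokens_types_paddings_from_ids(
    text_ids=[11, 12, 13], max_seq_length=8, cls_id=101, sep_id=102, pad_id=0)
print(ids)       # [101, 11, 12, 13, 102, 0, 0, 0]
print(pad_mask)  # [1 1 1 1 1 0 0 0]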
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/realm_dataset_utils.py
0 → 100644
import os
import time

import numpy as np
import torch

from megatron.training import print_rank_0
from megatron.core import mpu, tensor_parallel
from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
from megatron.training import get_args, get_tokenizer, print_rank_0


def get_one_epoch_dataloader(dataset, micro_batch_size=None):
    """Specifically one epoch to be used in an indexing job."""
    args = get_args()

    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    if micro_batch_size is None:
        micro_batch_size = args.micro_batch_size
    global_batch_size = micro_batch_size * world_size
    num_workers = args.num_workers

    sampler = torch.utils.data.SequentialSampler(dataset)
    # importantly, drop_last must be False to get all the data.
    assert False, 'DistributedBatchSampler deprecated, change the implementation'
    from megatron.legacy.data.samplers import DistributedBatchSampler
    batch_sampler = DistributedBatchSampler(sampler,
                                            batch_size=global_batch_size,
                                            drop_last=False,
                                            rank=rank,
                                            world_size=world_size)

    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=num_workers,
                                       pin_memory=True)


def get_ict_batch(data_iterator):
    # Items and their type.
    keys = ['query_tokens', 'query_pad_mask',
            'block_tokens', 'block_pad_mask', 'block_data']
    datatype = torch.int64

    # Broadcast data.
    if data_iterator is None:
        data = None
    else:
        data = next(data_iterator)
    data_b = tensor_parallel.broadcast_data(keys, data, datatype)

    # Unpack.
    query_tokens = data_b['query_tokens'].long()
    query_pad_mask = data_b['query_pad_mask'].long()
    block_tokens = data_b['block_tokens'].long()
    block_pad_mask = data_b['block_pad_mask'].long()
    block_indices = data_b['block_data'].long()

    return query_tokens, query_pad_mask, \
           block_tokens, block_pad_mask, block_indices


def join_str_list(str_list):
    """Join a list of strings, handling spaces appropriately"""
    result = ""
    for s in str_list:
        if s.startswith("##"):
            result += s[2:]
        else:
            result += " " + s
    return result


class BlockSampleData(object):
    """A struct for fully describing a fixed-size block of data as used in REALM

    :param start_idx: for first sentence of the block
    :param end_idx: for last sentence of the block (may be partially truncated in sample construction)
    :param doc_idx: the index of the document from which the block comes in the original indexed dataset
    :param block_idx: a unique integer identifier given to every block.
    """
    def __init__(self, start_idx, end_idx, doc_idx, block_idx):
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.doc_idx = doc_idx
        self.block_idx = block_idx

    def as_array(self):
        return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64)

    def as_tuple(self):
        return self.start_idx, self.end_idx, self.doc_idx, self.block_idx


class BlockSamplesMapping(object):
    def __init__(self, mapping_array):
        # make sure that the array is compatible with BlockSampleData
        assert mapping_array.shape[1] == 4
        self.mapping_array = mapping_array

    def __len__(self):
        return self.mapping_array.shape[0]

    def __getitem__(self, idx):
        """Get the data associated with an indexed sample."""
        sample_data = BlockSampleData(*self.mapping_array[idx])
        return sample_data


def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs,
                              max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False):
    """Get samples mapping for a dataset over fixed size blocks. This function also requires
    a dataset of the titles for the source documents since their lengths must be taken into account.

    :return: samples_mapping (BlockSamplesMapping)
    """

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{}s'.format(seed)
    if use_one_sent_docs:
        indexmap_filename += '_1sentok'
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if mpu.get_data_parallel_rank() == 0 and \
            not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert block_dataset.document_indices.dtype == np.int64
        assert block_dataset.sequence_lengths.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(name))

        from megatron.core.datasets import helpers
        mapping_array = helpers.build_blocks_mapping(
            block_dataset.document_indices,
            block_dataset.sequence_lengths,
            title_dataset.sequence_lengths,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            seed,
            verbose,
            use_one_sent_docs)

        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, mapping_array, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.tensor([1], dtype=torch.long, device='cuda')
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(group=mpu.get_data_parallel_group())

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()

    mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    samples_mapping = BlockSamplesMapping(mapping_array)

    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(mapping_array.shape[0]))

    return samples_mapping
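Not part of the commit: a small sketch of the BlockSampleData / BlockSamplesMapping structures above, using a fabricated 2 x 4 mapping array (one (start_idx, end_idx, doc_idx, block_idx) row per sample):

import numpy as np
from megatron.legacy.data.realm_dataset_utils import BlockSampleData, BlockSamplesMapping

mapping = BlockSamplesMapping(np.array([[0, 3, 7, 0],
                                        [3, 6, 7, 1]], dtype=np.int64))
print(len(mapping))            # 2
sample = mapping[1]            # a BlockSampleData
print(sample.as_tuple())       # (3, 6, 7, 1)
print(sample.as_array())       # [3 6 7 1]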
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/realm_index.py
0 → 100644
import itertools
import os
import pickle
import shutil

import numpy as np
import torch

from megatron.training import get_args
from megatron.core import mpu


def detach(tensor):
    return tensor.detach().cpu().numpy()


class OpenRetreivalDataStore(object):
    """
    Serializable data structure for holding data for blocks --
    embeddings and necessary metadata for Retriever
    """
    def __init__(self, embedding_path=None, load_from_path=True, rank=None):
        self.embed_data = dict()
        if embedding_path is None:
            args = get_args()
            embedding_path = args.embedding_path
            rank = args.rank
        self.embedding_path = embedding_path
        self.rank = rank

        if load_from_path:
            self.load_from_file()

        block_data_name = os.path.splitext(self.embedding_path)[0]
        self.temp_dir_name = block_data_name + '_tmp'

    def state(self):
        return {
            'embed_data': self.embed_data,
        }

    def clear(self):
        """
        Clear the embedding data structures to save memory.
        The metadata ends up getting used, and is also much smaller in
        dimensionality so it isn't really worth clearing.
        """
        self.embed_data = dict()

    def load_from_file(self):
        """Populate members from instance saved to file"""

        if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Unpickling BlockData", flush=True)
        state_dict = pickle.load(open(self.embedding_path, 'rb'))
        if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0:
            print(">> Finished unpickling BlockData\n", flush=True)

        self.embed_data = state_dict['embed_data']

    def add_block_data(self, row_id, block_embeds, allow_overwrite=False):
        """
        Add data for set of blocks
        :param row_id: 1D array of unique int ids for the blocks
        :param block_embeds: 2D array of embeddings of the blocks
            In the case of retriever this will be [start_idx, end_idx, doc_idx]
        """
        for idx, embed in zip(row_id, block_embeds):
            if not allow_overwrite and idx in self.embed_data:
                raise ValueError("Unexpectedly tried to overwrite block data")

            self.embed_data[idx] = np.float16(embed)

    def save_shard(self):
        """
        Save the block data that was created this in this process
        """
        if not os.path.isdir(self.temp_dir_name):
            os.makedirs(self.temp_dir_name, exist_ok=True)

        # save the data for each shard
        with open('{}/{}.pkl'.format(self.temp_dir_name, self.rank), 'wb') \
                as writer:
            pickle.dump(self.state(), writer)

    def merge_shards_and_save(self):
        # Combine all the shards made using save_shard
        shard_names = os.listdir(self.temp_dir_name)
        seen_own_shard = False

        for fname in os.listdir(self.temp_dir_name):
            shard_rank = int(os.path.splitext(fname)[0])
            if shard_rank == self.rank:
                seen_own_shard = True
                continue

            with open('{}/{}'.format(self.temp_dir_name, fname), 'rb') as f:
                data = pickle.load(f)
                old_size = len(self.embed_data)
                shard_size = len(data['embed_data'])

                # add the shard's data and check to make sure there
                # is no overlap
                self.embed_data.update(data['embed_data'])
                assert len(self.embed_data) == old_size + shard_size

        assert seen_own_shard

        # save the consolidated shards and remove temporary directory
        with open(self.embedding_path, 'wb') as final_file:
            pickle.dump(self.state(), final_file)
        shutil.rmtree(self.temp_dir_name, ignore_errors=True)

        print("Finished merging {} shards for a total of {} embeds".format(
            len(shard_names), len(self.embed_data)), flush=True)


class FaissMIPSIndex(object):
    """
    Wrapper object for a BlockData which similarity search via FAISS under the hood
    """
    def __init__(self, embed_size, embed_data=None, use_gpu=False):
        self.embed_size = embed_size
        self.embed_data = embed_data
        self.use_gpu = use_gpu

        self.mips_index = None
        self._set_mips_index()

    def _set_mips_index(self):
        """
        Create a Faiss Flat index with inner product as the metric
        to search against
        """
        try:
            import faiss
        except ImportError:
            raise Exception("Error: Please install faiss to use FaissMIPSIndex")

        if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Building index", flush=True)

        cpu_index = faiss.IndexFlatIP(self.embed_size)

        if self.use_gpu:
            # create resources and config for GpuIndex
            config = faiss.GpuMultipleClonerOptions()
            config.shard = True
            config.useFloat16 = True
            gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
            self.mips_index = faiss.IndexIDMap(gpu_index)
            if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on GPU", flush=True)
        else:
            # CPU index supports IDs so wrap with IDMap
            self.mips_index = faiss.IndexIDMap(cpu_index)
            if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on CPU", flush=True)

        # if we were constructed with a BlockData, then automatically load it
        # when the FAISS structure is built
        if self.embed_data is not None:
            self.add_embed_data(self.embed_data)

    def reset_index(self):
        """Delete existing index and create a new"""
        del self.mips_index

        # reset the block data so that _set_block_index will reload it as well
        if self.embed_data is not None:
            embed_data_path = self.embed_data.embedding_path
            del self.embed_data
            self.embed_data = OpenRetreivalDataStore(embed_data_path)

        self._set_mips_index()

    def update_index(self):
        """Delete existing index and create a new"""
        del self.mips_index

        # reset the block data so that _set_mips_index will reload it as well
        if self.embed_data is not None:
            self.embed_data.load_from_file()

        self._set_mips_index()

    def add_embed_data(self, all_embed_data):
        """Add the embedding of each block to the underlying FAISS index"""

        # this assumes the embed_data is a dict : {int: np.array<float>}
        block_indices, block_embeds = zip(*all_embed_data.embed_data.items())

        # the embeddings have to be entered in as float32 even though the math
        # internally is done with float16.
        embeds_arr = np.float32(np.array(block_embeds))
        indices_arr = np.array(block_indices)

        # we no longer need the embedding data since it's in the index now
        all_embed_data.clear()

        self.mips_index.add_with_ids(embeds_arr, indices_arr)

        if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0:
            print(">>> Finished adding block data to index", flush=True)

    def search_mips_index(self, query_embeds, top_k, reconstruct=True):
        """
        Get the top-k blocks by the index distance metric.

        :param reconstruct: if True: return a [num_queries x k x embed_dim]
                                array of blocks
                            if False: return [num_queries x k] array of
                                distances, and another for indices
        """
        query_embeds = np.float32(detach(query_embeds))

        if reconstruct:
            # get the vectors themselves
            top_k_block_embeds = self.mips_index.search_and_reconstruct(query_embeds, top_k)
            return top_k_block_embeds
        else:
            # get distances and indices of closest vectors
            distances, block_indices = self.mips_index.search(query_embeds, top_k)
            return distances, block_indices
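Not part of the commit: a minimal sketch of querying the FaissMIPSIndex wrapper above with random embeddings. It assumes the faiss package is installed and that Megatron's model-parallel state is simply uninitialized; the sizes are made up.

import numpy as np
import torch
from megatron.legacy.data.realm_index import FaissMIPSIndex

index = FaissMIPSIndex(embed_size=128, use_gpu=False)
ids = np.arange(1000, dtype=np.int64)
embeds = np.random.randn(1000, 128).astype(np.float32)
index.mips_index.add_with_ids(embeds, ids)   # or load blocks via an OpenRetreivalDataStore

queries = torch.randn(4, 128)
distances, block_ids = index.search_mips_index(queries, top_k=5, reconstruct=False)
print(block_ids.shape)                       # (4, 5)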
Megatron-LM-core_r0.7.0.beta/megatron/legacy/data/vit_dataset.py
0 → 100644
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import os
import random
import numpy as np
import torch
import torchvision.transforms as T
from torchvision import datasets
from megatron.training import get_args
from megatron.legacy.data.image_folder import ImageFolder
from megatron.legacy.data.autoaugment import ImageNetPolicy
from megatron.legacy.data.data_samplers import RandomSeedDataset
from PIL import Image, ImageFilter, ImageOps


class GaussianBlur(object):
    """
    Apply Gaussian Blur to the PIL image.
    """
    def __init__(self, p=0.5, radius_min=0.1, radius_max=2.):
        self.prob = p
        self.radius_min = radius_min
        self.radius_max = radius_max

    def __call__(self, img):
        do_it = random.random() <= self.prob
        if not do_it:
            return img

        return img.filter(
            ImageFilter.GaussianBlur(
                radius=random.uniform(self.radius_min, self.radius_max)
            )
        )


class Solarization(object):
    """
    Apply Solarization to the PIL image.
    """
    def __init__(self, p):
        self.p = p

    def __call__(self, img):
        if random.random() < self.p:
            return ImageOps.solarize(img)
        else:
            return img


class ClassificationTransform():
    def __init__(self, image_size, train=True):
        args = get_args()
        assert args.fp16 or args.bf16
        self.data_type = torch.half if args.fp16 else torch.bfloat16
        if train:
            self.transform = T.Compose([
                T.RandomResizedCrop(image_size),
                T.RandomHorizontalFlip(),
                T.ColorJitter(0.4, 0.4, 0.4, 0.1),
                ImageNetPolicy(),
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                T.ConvertImageDtype(self.data_type)
            ])
        else:
            self.transform = T.Compose([
                T.Resize(image_size),
                T.CenterCrop(image_size),
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                T.ConvertImageDtype(self.data_type)
            ])

    def __call__(self, input):
        output = self.transform(input)
        return output


class InpaintingTransform():
    def __init__(self, image_size, train=True):

        args = get_args()
        self.mask_factor = args.mask_factor
        self.mask_type = args.mask_type
        self.image_size = image_size
        self.patch_size = args.patch_dim
        self.mask_size = int(self.mask_factor
                             * (image_size[0] / self.patch_size)
                             * (image_size[1] / self.patch_size))
        self.train = train
        assert args.fp16 or args.bf16
        self.data_type = torch.half if args.fp16 else torch.bfloat16

        if self.train:
            self.transform = T.Compose([
                T.RandomResizedCrop(self.image_size),
                T.RandomHorizontalFlip(),
                T.ColorJitter(0.4, 0.4, 0.4, 0.1),
                ImageNetPolicy(),
                T.ToTensor(),
                T.ConvertImageDtype(self.data_type)
            ])
        else:
            self.transform = T.Compose([
                T.Resize(self.image_size, interpolation=2),
                T.CenterCrop(self.image_size),
                T.ToTensor(),
                T.ConvertImageDtype(self.data_type)
            ])

    def gen_mask(self, image_size, mask_size, mask_type, patch_size):
        # output: mask as a list with indices for missing patches
        action_list = [[0, 1], [0, -1], [1, 0], [-1, 0]]
        assert image_size[0] == image_size[1]
        img_size_patch = image_size[0] // patch_size

        # drop masked patches
        mask = torch.zeros((image_size[0], image_size[1]), dtype=torch.float)

        if mask_type == 'random':
            x = torch.randint(0, img_size_patch, ())
            y = torch.randint(0, img_size_patch, ())
            for i in range(mask_size):
                r = torch.randint(0, len(action_list), ())
                x = torch.clamp(x + action_list[r][0], min=0, max=img_size_patch - 1)
                y = torch.clamp(y + action_list[r][1], min=0, max=img_size_patch - 1)
                x_offset = x * patch_size
                y_offset = y * patch_size
                mask[x_offset:x_offset + patch_size, y_offset:y_offset + patch_size] = 1
        else:
            assert mask_type == 'row'
            count = 0
            for x in reversed(range(img_size_patch)):
                for y in reversed(range(img_size_patch)):
                    if (count < mask_size):
                        count += 1
                        x_offset = x * patch_size
                        y_offset = y * patch_size
                        mask[x_offset:x_offset + patch_size, y_offset:y_offset + patch_size] = 1
        return mask

    def __call__(self, input):
        trans_input = self.transform(input)
        mask = self.gen_mask(self.image_size, self.mask_size,
                             self.mask_type, self.patch_size)
        mask = mask.unsqueeze(dim=0)
        return trans_input, mask


class DinoTransform(object):
    def __init__(self, image_size, train=True):
        args = get_args()
        self.data_type = torch.half if args.fp16 else torch.bfloat16

        flip_and_color_jitter = T.Compose([
            T.RandomHorizontalFlip(p=0.5),
            T.RandomApply(
                [T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
                p=0.8
            ),
            T.RandomGrayscale(p=0.2),
        ])

        if args.fp16 or args.bf16:
            normalize = T.Compose([
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                T.ConvertImageDtype(self.data_type)
            ])
        else:
            normalize = T.Compose([
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
            ])

        # first global crop
        scale_const = 0.4
        self.global_transform1 = T.Compose([
            T.RandomResizedCrop(image_size,
                                scale=(scale_const, 1),
                                interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            GaussianBlur(1.0),
            normalize
        ])
        # second global crop
        self.global_transform2 = T.Compose([
            T.RandomResizedCrop(image_size,
                                scale=(scale_const, 1),
                                interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            GaussianBlur(0.1),
            Solarization(0.2),
            normalize
        ])
        # transformation for the local small crops
        self.local_crops_number = args.dino_local_crops_number
        self.local_transform = T.Compose([
            T.RandomResizedCrop(args.dino_local_img_size,
                                scale=(0.05, scale_const),
                                interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            GaussianBlur(p=0.5),
            normalize
        ])

    def __call__(self, image):
        crops = []
        crops.append(self.global_transform1(image))
        crops.append(self.global_transform2(image))
        for _ in range(self.local_crops_number):
            crops.append(self.local_transform(image))
        return crops


def build_train_valid_datasets(data_path, image_size=224):
    args = get_args()

    if args.vision_pretraining_type == 'classify':
        train_transform = ClassificationTransform(image_size)
        val_transform = ClassificationTransform(image_size, train=False)
    elif args.vision_pretraining_type == 'inpaint':
        train_transform = InpaintingTransform(image_size, train=False)
        val_transform = InpaintingTransform(image_size, train=False)
    elif args.vision_pretraining_type == 'dino':
        train_transform = DinoTransform(image_size, train=True)
        val_transform = ClassificationTransform(image_size, train=False)
    else:
        raise Exception('{} vit pretraining type is not supported.'.format(
                args.vit_pretraining_type))

    # training dataset
    train_data_path = data_path[0] if len(data_path) <= 2 else data_path[2]
    train_data = ImageFolder(
        root=train_data_path,
        transform=train_transform,
        classes_fraction=args.classes_fraction,
        data_per_class_fraction=args.data_per_class_fraction
    )
    train_data = RandomSeedDataset(train_data)

    # validation dataset
    val_data_path = data_path[1]
    val_data = ImageFolder(
        root=val_data_path,
        transform=val_transform
    )
    val_data = RandomSeedDataset(val_data)

    return train_data, val_data
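Not part of the commit: a small sketch of the two standalone PIL augmentations defined above (GaussianBlur and Solarization), using a synthetic image and probabilities of 1.0 so both transforms always fire. It assumes the module imports cleanly outside a full Megatron run.

from PIL import Image
from megatron.legacy.data.vit_dataset import GaussianBlur, Solarization

img = Image.new("RGB", (224, 224), color=(128, 64, 32))     # synthetic stand-in image
blur = GaussianBlur(p=1.0, radius_min=0.5, radius_max=1.5)  # p=1.0 forces the blur
solarize = Solarization(p=1.0)                               # p=1.0 forces the solarize
out = solarize(blur(img))
print(out.size, out.mode)                                    # (224, 224) RGB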