OpenDAS / Megatron-LM / Commits

Commit 688448db, authored Mar 14, 2025 by silencealiang
Commit message: "Update code" (更新代码)
Parent: a02a5490
Pipeline #2503 passed

Showing 20 changed files with 5021 additions and 3804 deletions (+5021, -3804)
tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py  +291  -175
tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py  +160  -0
tests/unit_tests/models/test_gpt_model.py  +118  -81
tests/unit_tests/models/test_llava_model.py  +1019  -897
tests/unit_tests/models/test_multimodal_projector.py  +75  -75
tests/unit_tests/models/test_radio_model.py  +61  -0
tests/unit_tests/models/test_t5_model.py  +365  -362
tests/unit_tests/pipeline_parallel/test_schedules.py  +387  -271
tests/unit_tests/test_model_configs.py  +37  -0
tests/unit_tests/test_optimizer.py  +162  -113
tests/unit_tests/test_optimizer_cpu_offloading.py  +141  -0
tests/unit_tests/test_parallel_state.py  +524  -517
tests/unit_tests/test_utils.py  +272  -213
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py  +100  -99
tests/unit_tests/transformer/moe/test_aux_loss.py  +144  -142
tests/unit_tests/transformer/moe/test_moe_layer.py  +189  -189
tests/unit_tests/transformer/moe/test_routers.py  +235  -145
tests/unit_tests/transformer/moe/test_token_dispatcher.py  +361  -272
tests/unit_tests/transformer/test_attention.py  +127  -123
tests/unit_tests/transformer/test_multi_latent_attention.py  +253  -130
tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py

import copy
import os
import random
import string
import time
from collections import OrderedDict
-from typing import Dict
+from typing import Dict, List
from unittest import mock

import pytest
...

@@ -30,7 +31,7 @@ from tests.unit_tests.test_utilities import Utils

class TestTextGenerationController:

-    def setup_method(self, method):
+    def setup_model(self, dtype):
        Utils.initialize_model_parallel(
            tensor_model_parallel_size=2, pipeline_model_parallel_size=2
        )
...

@@ -58,8 +59,10 @@ class TestTextGenerationController:
        inference_wrapper_config = InferenceWrapperConfig(
            hidden_size=self.hidden_size,
            inference_batch_times_seqlen_threshold=-1,
+            inference_max_seq_length=2048,
+            inference_max_requests=self.batch_size,
            fp32_residual_connection=False,
-            params_dtype=torch.float,
+            params_dtype=dtype,
            padded_vocab_size=self.vocab_size,
        )
...

@@ -75,6 +78,8 @@ class TestTextGenerationController:
        Utils.destroy_model_parallel()

    def test_sample_from_logits(self):
+        self.setup_model(torch.float32)
        with pytest.raises(AssertionError) as aerror:
            self.text_generation_controller.sample_from_logits(
                last_token_logits=None,
...

@@ -138,27 +143,98 @@ class TestTextGenerationController:
            sampled_logits >= expected_min_value
        ), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}"

-    def test_generate_all_output_tokens_static_batch(self):
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+    def test_generate_all_output_tokens_static_batch(self, dtype):
+        self.setup_model(dtype)
        self.mock_tokenizer.vocab_size = self.vocab_size
        self.mock_tokenizer.eod = self.vocab_size - 1
-        self.mock_tokenizer.detokenize.return_value = ''.join(
-            random.choices(string.ascii_letters, k=random.randint(4, 10))
-        )
+        self.mock_tokenizer.detokenize.side_effect = lambda x: ' '.join(
+            [
+                ''.join(random.choices(string.ascii_letters, k=random.randint(4, 10)))
+                for _ in range(len(x))
+            ]
+        )
+        self.mock_tokenizer.offsets.side_effect = lambda _, s: [
+            i for i, c in enumerate(s) if c == ' '
+        ] + [len(s)]

-        active_requests: Dict[int, InferenceRequest] = OrderedDict()
+        active_requests: Dict[str, InferenceRequest] = OrderedDict()
+        all_prompt_tokens: Dict[str, List[int]] = OrderedDict()
        for i in range(self.batch_size):
            prompt = "sample" * (i + 1)
            self.mock_tokenizer.tokenize.return_value = torch.randn(
                self.batch_size, self.vocab_size
            ).cuda()
            prompt_tokens = torch.randint(
                low=0, high=self.vocab_size - 1, size=(len(prompt),)
            ).tolist()
+            request_id = str(i)
            inference_request = InferenceRequest(
                request_id=request_id,
                prompt=prompt,
                inference_parameters=SamplingParams(
                    num_tokens_to_generate=10, return_log_probs=True, return_segments=True
                ),
                arrival_time=time.time(),
                prompt_tokens=prompt_tokens,
                status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
            )
            active_requests[request_id] = inference_request
            all_prompt_tokens[request_id] = copy.deepcopy(prompt_tokens)

        requests = self.text_generation_controller.generate_all_output_tokens_static_batch(
            active_requests
        )

        for request_id, request in requests.items():
            assert (
                request.status == Status.COMPLETED
            ), f"Status should be completed but its {request.status}"
            assert request.generated_length > 0, f"Generated length should be greater than zero"
            assert request.generated_text is not None, "Generated text should not be None"
            assert (
                all_prompt_tokens[request_id] == request.prompt_tokens
            ), "Prompt tokens should not have changed during generation"
            assert len(request.segments) == len(request.prompt_log_probs) + len(
                request.generated_log_probs
            ), "Segments should be returned for both prompt and generated tokens"
            assert len(request.prompt) + len(request.generated_text) == len(
                request.text
            ), "Output text should include prompts and generations"

+    @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+    def test_output_log_probs(self, dtype):
+        self.setup_model(dtype)
        self.mock_tokenizer.vocab_size = self.vocab_size
        self.mock_tokenizer.bos = 0
        self.mock_tokenizer.eod = self.vocab_size - 1
        self.mock_tokenizer.detokenize.side_effect = lambda x: ' '.join(
            [
                ''.join(random.choices(string.ascii_letters, k=random.randint(4, 10)))
                for _ in range(len(x))
            ]
        )
        self.mock_tokenizer.offsets.side_effect = lambda _, s: [
            i for i, c in enumerate(s) if c == ' '
        ] + [len(s)]

        prompt = ""
        active_requests: Dict[int, InferenceRequest] = OrderedDict()
        for i in range(self.batch_size):
            self.mock_tokenizer.tokenize.return_value = torch.randn(
                self.batch_size, self.vocab_size
            ).cuda()
            inference_request = InferenceRequest(
                request_id=i,
                prompt=prompt,
-                inference_parameters=SamplingParams(num_tokens_to_generate=10),
+                inference_parameters=SamplingParams(
+                    num_tokens_to_generate=1, return_log_probs=True
+                ),
                arrival_time=time.time(),
-                prompt_tokens=torch.randint(
-                    low=0, high=self.vocab_size - 1, size=(len(prompt),)
-                ).tolist(),
+                prompt_tokens=[self.mock_tokenizer.bos],
                status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
            )
            active_requests[i] = inference_request
...

@@ -173,3 +249,43 @@ class TestTextGenerationController:
            ), f"Status should be completed but its {request.status}"
            assert request.generated_length > 0, f"Generated length should be greater than zero"
            assert request.generated_text is not None, "Generated text should not be None"
            assert len(request.generated_log_probs) == request.generated_length

    def test_token_overflow(self):
        self.setup_model(torch.float32)
        self.mock_tokenizer.vocab_size = self.vocab_size
        self.mock_tokenizer.bos = 0
        self.mock_tokenizer.eod = self.vocab_size - 1
        self.mock_tokenizer.detokenize.side_effect = lambda x: ' '.join(
            [
                ''.join(random.choices(string.ascii_letters, k=random.randint(4, 10)))
                for _ in range(len(x))
            ]
        )
        self.mock_tokenizer.offsets.side_effect = lambda _, s: [
            i for i, c in enumerate(s) if c == ' '
        ] + [len(s)]

        prompt = ""
        active_requests: Dict[int, InferenceRequest] = OrderedDict()
        for i in range(self.batch_size):
            self.mock_tokenizer.tokenize.return_value = torch.randn(
                self.batch_size, self.vocab_size
            ).cuda()
            inference_request = InferenceRequest(
                request_id=i,
                prompt=prompt,
                inference_parameters=SamplingParams(
                    num_tokens_to_generate=4096, return_log_probs=True
                ),
                arrival_time=time.time(),
                prompt_tokens=[self.mock_tokenizer.bos],
                status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
            )
            active_requests[i] = inference_request

        with pytest.raises(AssertionError):
            requests = self.text_generation_controller.generate_all_output_tokens_static_batch(
                active_requests
            )
tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py (new file)

import copy
import os
import random
import string
import time
from argparse import Namespace
from collections import OrderedDict
from typing import Dict
from unittest import mock

import pytest
import torch

from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.inference_request import InferenceRequest, Status, VLMInferenceRequest
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)
from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import (
    VLMInferenceWrapper,
)
from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
    VLMTextGenerationController,
)
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.models.multimodal.llava_model import LLaVAModel
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.enums import AttnBackend
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.legacy.model import Float16Module
from tests.unit_tests.test_utilities import Utils


class TestVLMTextGenerationController:

    @pytest.mark.internal  # The model is under active development and its methods may change.
    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1)
        model_parallel_cuda_manual_seed(123)

        self.language_hidden_size = 64
        self.language_num_attention_heads = 4
        self.language_vocab_size = 8192
        self.language_max_sequence_length = 4096
        self.img_h = 336
        self.img_w = 336

        language_config = TransformerConfig(
            num_layers=3,
            hidden_size=self.language_hidden_size,
            num_attention_heads=self.language_num_attention_heads,
            use_cpu_initialization=False,
        )
        vision_config = TransformerConfig(
            num_layers=2, hidden_size=16, num_attention_heads=2, use_cpu_initialization=False
        )
        vision_projection_config = TransformerConfig(
            num_layers=2,
            hidden_size=self.language_hidden_size,
            ffn_hidden_size=32,
            num_attention_heads=1,
            use_cpu_initialization=False,
        )

        language_layer_spec = get_gpt_layer_local_spec()
        vision_layer_spec = copy.deepcopy(language_layer_spec)
        vision_projection_spec = copy.deepcopy(language_layer_spec.submodules.mlp.submodules)

        language_config.language_model_type = "dummy"
        vision_config.vision_model_type = "clip"
        self.model = LLaVAModel(
            language_transformer_config=language_config,
            language_transformer_layer_spec=language_layer_spec,
            language_vocab_size=self.language_vocab_size,
            language_max_sequence_length=self.language_max_sequence_length,
            vision_transformer_config=vision_config,
            vision_transformer_layer_spec=vision_layer_spec,
            drop_vision_class_token=False,
            vision_projection_config=vision_projection_config,
            vision_projection_layer_spec=vision_projection_spec,
            img_h=self.img_h,
            img_w=self.img_w,
            patch_dim=14,
        ).cuda()
        self.image_token_index = self.model.image_token_index
        self.model = Float16Module(self.model, Namespace(fp16=False, bf16=True))

        inference_wrapper_config = InferenceWrapperConfig(
            hidden_size=self.language_hidden_size,
            inference_batch_times_seqlen_threshold=-1,
            fp32_residual_connection=False,
            params_dtype=torch.float,
            padded_vocab_size=self.language_vocab_size,
        )

        inference_wrapped_model = VLMInferenceWrapper(self.model, inference_wrapper_config)

        self.mock_tokenizer = mock.Mock()

        self.text_generation_controller = VLMTextGenerationController(
            inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer
        )

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_generate_all_output_tokens_static_batch(self):
        self.mock_tokenizer.vocab_size = self.language_vocab_size
        self.mock_tokenizer.eod = self.language_vocab_size - 1
        self.mock_tokenizer.detokenize.return_value = ''.join(
            random.choices(string.ascii_letters, k=random.randint(4, 10))
        )

        batch_size: int = 1
        num_img_embeddings_per_tile: int = 576
        imgs: torch.Tensor = torch.randn(1, 3, self.img_h, self.img_w).cuda()
        num_tiles: torch.Tensor = torch.Tensor([1]).int()
        decoder_seq_length: int = self.language_max_sequence_length

        active_requests: Dict[str, InferenceRequest] = OrderedDict()
        all_prompt_tokens: Dict[str, List[int]] = OrderedDict()
        for i in range(batch_size):
            prompt = "sample" * (i + 1)
            self.mock_tokenizer.tokenize.return_value = torch.randn(
                batch_size, self.language_vocab_size
            ).cuda()
            prompt_tokens = torch.randint(
                low=0, high=self.language_vocab_size - 1, size=(len(prompt),)
            ).tolist()
            prompt_tokens[3] = self.image_token_index

            request_id = str(i)
            inference_request = VLMInferenceRequest(
                request_id=request_id,
                prompt=prompt,
                inference_parameters=CommonInferenceParams(num_tokens_to_generate=10),
                arrival_time=time.time(),
                prompt_tokens=prompt_tokens,
                num_img_embeddings_per_tile=num_img_embeddings_per_tile,
                imgs=imgs,
                num_tiles=num_tiles,
                decoder_seq_length=decoder_seq_length,
                status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
            )
            active_requests[request_id] = inference_request
            all_prompt_tokens[request_id] = copy.deepcopy(prompt_tokens)

        requests = self.text_generation_controller.generate_all_output_tokens_static_batch(
            active_requests
        )

        for request_id, request in requests.items():
            assert (
                request.status == Status.COMPLETED
            ), f"Status should be completed but its {request.status}"
            assert request.generated_length > 0, f"Generated length should be greater than zero"
            assert request.generated_text is not None, "Generated text should not be None"
            assert (
                all_prompt_tokens[request_id] == request.prompt_tokens
            ), "Prompt tokens should not have changed during generation"
tests/unit_tests/models/test_gpt_model.py

# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import inspect
import os

import pytest
import torch

-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_with_transformer_engine_spec,
+    get_mlp_module_spec,
+)
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
...

@@ -59,7 +63,7 @@ class TestGPTModel:
    @pytest.mark.internal
    def test_post_process_forward(self):
-        config: TransformerConfig = self.gpt_model.config
+        _ = self.gpt_model.config
        sequence_length = self.gpt_model.max_sequence_length
        micro_batch_size = 2
...

@@ -79,3 +83,36 @@ class TestGPTModel:
        assert logits.shape[0] == micro_batch_size
        assert logits.shape[1] == sequence_length
        assert logits.shape[2] == self.gpt_model.vocab_size


def test_get_mlp_module_spec_interface():
    # Get the function signature
    sig = inspect.signature(get_mlp_module_spec)

    # Define the expected signature
    expected_params = {
        "use_te": inspect.Parameter.POSITIONAL_OR_KEYWORD,
        "num_experts": inspect.Parameter.POSITIONAL_OR_KEYWORD,
        "moe_grouped_gemm": inspect.Parameter.POSITIONAL_OR_KEYWORD,
        "fp8": inspect.Parameter.POSITIONAL_OR_KEYWORD,
        "moe_use_legacy_grouped_gemm": inspect.Parameter.POSITIONAL_OR_KEYWORD,
    }
    expected_defaults = {
        "use_te": True,
        "num_experts": None,
        "moe_grouped_gemm": False,
        "fp8": None,
        "moe_use_legacy_grouped_gemm": False,
    }

    # Check parameter kinds
    for param_name, param in sig.parameters.items():
        assert param_name in expected_params.keys(), f"Unexpected parameter: {param_name}"
        assert param.kind is expected_params[param_name], f"Wrong kind for parameter: {param_name}"

    # Check default values
    defaults = {
        k: v.default for k, v in sig.parameters.items() if v.default is not inspect.Parameter.empty
    }
    assert defaults == expected_defaults, "Default values do not match the expected ones."
tests/unit_tests/models/test_llava_model.py

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from contextlib import nullcontext
from copy import deepcopy
from types import SimpleNamespace
...

@@ -8,6 +9,7 @@ import torch

from megatron.core import InferenceParams
from megatron.core import parallel_state as ps
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.models.multimodal import context_parallel
from megatron.core.models.multimodal.llava_model import LLaVAModel
from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec
from megatron.core.packed_seq_params import PackedSeqParams
...

@@ -49,6 +51,7 @@ class TestLLaVAModel:
        vision_layer_spec = deepcopy(language_layer_spec)
        vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)

+        language_config.language_model_type = "dummy"
        vision_config.vision_model_type = "clip"
        self.model = LLaVAModel(
            language_transformer_config=language_config,
...

@@ -131,7 +134,6 @@ class TestLLaVAModel:
        use_inference_kv_cache = False
        inference_params = None
-        image_token_mask = None

        embeddings, labels, loss_mask = self.model._preprocess_data(
            image_embeddings,
...

@@ -143,7 +145,6 @@ class TestLLaVAModel:
            inference_params,
            image_token_index,
            num_image_tiles,
-            image_token_mask,
        )

        img_seq_len = 577
...

@@ -320,21 +321,27 @@ class TestLLaVAModel:
        # Try with labels and PackedSeqParams. Only micro batch size 1 is supported in this mode.
        packed_seq_params = PackedSeqParams(
            qkv_format="thd",
-            cu_seqlens_q=[0, 512, 1024, 1600],  # Just example values.
-            cu_seqlens_kv=[0, 512, 1024, 1600],
-            max_seqlen_q=[1600],
-            max_seqlen_kv=[1600],
+            cu_seqlens_q=torch.tensor(
+                [0, 512, 1024, 1600], dtype=torch.int32
+            ).cuda(),  # Just example values.
+            cu_seqlens_kv=torch.tensor([0, 512, 1024, 1600], dtype=torch.int32).cuda(),
+            max_seqlen_q=torch.tensor(1600, dtype=torch.int32).cuda(),
+            max_seqlen_kv=torch.tensor(1600, dtype=torch.int32).cuda(),
        )

+        # NOTE: Packing is only supported with BF16. Use BF16 here and switch back to default.
+        self.model.to(torch.bfloat16)
        loss, new_loss_mask = self.model.forward(
-            img[:1],
+            img[:1].to(torch.bfloat16),
            input_ids[:1],
            position_ids[:1],
            attention_mask,
            labels[:1],
            loss_mask[:1],
            num_image_tiles=num_image_tiles[:1],
            packed_seq_params=packed_seq_params,
        )
+        self.model.to(torch.float32)

        # 1600 = 577 (img_seq_len) + 1024 (text tokens in the first sample) - 1 (image token).
        assert loss.shape == new_loss_mask.shape == torch.Size((1, 1600))
...

@@ -391,6 +398,49 @@ class TestLLaVAModel:
            == torch.Size((max_seq_len, 5, self.language_num_attention_heads, 16))
        )

    @pytest.mark.internal
    def test_forward_fsdp(self):
        """Test FSDP workaround for text-only data.

        FSDP can hang with text-only data. As a workaround, we run the vision model
        with a dummy image, but then effectively discard the image embeddings.
        """
        self.model.cuda()

        # Dummy image for the FSDP workaround but not image tiles.
        img = torch.zeros((1, 3, 336, 336)).cuda()
        num_image_tiles = torch.tensor([], dtype=torch.int).cuda()

        # No image tag in the input ids (text-only sample).
        image_token_index = self.model.image_token_index
        input_ids = torch.arange(1024, device="cuda").unsqueeze(0)
        assert (
            torch.sum(input_ids == image_token_index) == 0
        ), "expected no image tag in the input ids"

        position_ids = torch.arange(1024, device="cuda").unsqueeze(0)
        loss_mask = torch.ones((1, 1024), device="cuda")
        attention_mask = None  # Causal.
        labels = torch.arange(1, 1025, device="cuda").unsqueeze(0)

        # Mock the FSDP attribute.
        self.model.vision_model._is_fsdp_managed_module = True
        loss, new_loss_mask = self.model.forward(
            img,
            input_ids,
            position_ids,
            attention_mask,
            labels,
            loss_mask,
            num_image_tiles=num_image_tiles,
        )
        self.model.vision_model._is_fsdp_managed_module = False

        assert loss.shape == new_loss_mask.shape == torch.Size((1, 1024))

    @pytest.mark.internal
    def test_save_load(self, tmp_path):
        path = tmp_path / "model.pt"
...

@@ -436,6 +486,7 @@ class TestLLaVAModelSigLIP:
        vision_layer_spec = deepcopy(language_layer_spec)
        vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)

+        language_config.language_model_type = "dummy"
        vision_config.vision_model_type = "siglip"
        self.model = LLaVAModel(
            language_transformer_config=language_config,
...

@@ -482,19 +533,18 @@ def create_test_args(cp_size, sequence_parallel):

class TestLLaVAModelTokenParallel:
-    def init_llava_model(self):
-        self.language_hidden_size = 64
-        self.language_num_attention_heads = 16
+    def _init_llava_model(self, cp_size, tp_size, sequence_parallel):
+        language_hidden_size = 64
+        language_num_attention_heads = 16
        language_config = TransformerConfig(
            num_layers=3,
-            hidden_size=self.language_hidden_size,
-            num_attention_heads=self.language_num_attention_heads,
+            hidden_size=language_hidden_size,
+            num_attention_heads=language_num_attention_heads,
            use_cpu_initialization=False,
-            tensor_model_parallel_size=self.tp_size,
-            sequence_parallel=self.sequence_parallel,
-            context_parallel_size=1,
-            # Init with CP=1 until CI catches up to TEv1.10
-            # context_parallel_size=self.cp_size,
+            tensor_model_parallel_size=tp_size,
+            sequence_parallel=sequence_parallel,
+            context_parallel_size=cp_size,
        )
        # SP and CP are not yet supported for the Vision Backbone
        vision_config = TransformerConfig(
...

@@ -502,17 +552,17 @@ class TestLLaVAModelTokenParallel:
            hidden_size=16,
            num_attention_heads=8,
            use_cpu_initialization=False,
-            tensor_model_parallel_size=self.tp_size,
+            tensor_model_parallel_size=tp_size,
            sequence_parallel=False,
            context_parallel_size=1,
        )
        vision_projection_config = TransformerConfig(
            num_layers=2,
-            hidden_size=self.language_hidden_size,
-            ffn_hidden_size=1024,
+            hidden_size=language_hidden_size,
+            ffn_hidden_size=128,
            num_attention_heads=8,
            use_cpu_initialization=False,
-            tensor_model_parallel_size=self.tp_size,
+            tensor_model_parallel_size=tp_size,
            sequence_parallel=False,
            context_parallel_size=1,
        )
...

@@ -537,8 +587,9 @@ class TestLLaVAModelTokenParallel:
        vision_layer_spec = deepcopy(language_layer_spec)
        vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)

+        language_config.language_model_type = "dummy"
        vision_config.vision_model_type = "clip"
-        self.model = LLaVAModel(
+        model = LLaVAModel(
            language_transformer_config=language_config,
            language_transformer_layer_spec=language_layer_spec,
            language_vocab_size=8192,
...

@@ -553,7 +604,9 @@ class TestLLaVAModelTokenParallel:
            patch_dim=14,
        )

-    @pytest.mark.internal  # The model is under active development and its methods may change.
+        return model
+
+    @pytest.mark.internal
    def setup_method(self, method):
        Utils.destroy_model_parallel()
...

@@ -563,31 +616,46 @@ class TestLLaVAModelTokenParallel:
    @pytest.mark.internal
    @pytest.mark.parametrize(
-        "cp_size,tp_size,sequence_parallel", [(1, 8, True), (2, 4, False), (2, 4, True)]
+        "cp_size,tp_size,sequence_parallel,padding",
+        [(1, 8, True, True), (2, 4, False, True), (2, 4, True, False), (2, 4, True, True)],
    )
-    def test_process_embedding_token_parallel(self, cp_size, tp_size, sequence_parallel):
-        self.cp_size = cp_size
-        self.tp_size = tp_size
-        self.sequence_parallel = sequence_parallel
+    def test_process_embedding_token_parallel(
+        self, cp_size, tp_size, sequence_parallel, padding
+    ):
+        """Test _process_embedding_token_parallel.

+        Note: This test requires TE version >= 1.10.0 to run properly.
+        """
        Utils.initialize_model_parallel(
-            tensor_model_parallel_size=self.tp_size, context_parallel_size=self.cp_size
+            tensor_model_parallel_size=tp_size, context_parallel_size=cp_size
        )
        model_parallel_cuda_manual_seed(123)
-        self.init_llava_model()
-        self.model.cuda()
-        # Setting CP size for LLM here as model init is done with CP=1 to
-        # avoid TE version check until CI catches up to TEv1.10
-        if self.cp_size > 1:
-            self.model.context_parallel_lm = self.cp_size
-        args = create_test_args(self.cp_size, self.sequence_parallel)
+        # TE version must be at least 1.10.0 if using context parallelism. Exit otherwise.
+        ctx = (
+            nullcontext()
+            if (is_te_min_version("1.10.0") or cp_size <= 1)
+            else pytest.raises(AssertionError)
+        )
+        model = None
+        with ctx:
+            model = self._init_llava_model(cp_size, tp_size, sequence_parallel)
+        if model is None:
+            return
+        model.cuda()
+        args = create_test_args(cp_size, sequence_parallel)
        set_args(args)

        batch_size = 2
+        if padding:
            combined_valid_seqlen = 2049
            combined_padded_seqlen = 2056
-            if self.cp_size > 1:
+            if cp_size > 1:
                combined_padded_seqlen = 2064
+        else:
+            combined_valid_seqlen = 2048
+            combined_padded_seqlen = 2048

        combined_embeddings = torch.ones(
            [batch_size, combined_padded_seqlen, 4096], device='cuda', dtype=torch.bfloat16
        )  # [B, S, H]
...

@@ -617,6 +685,20 @@ class TestLLaVAModelTokenParallel:
            device=combined_embeddings.device,
        )

+        qkv_format = 'sbhd'  # Default format when not using padding
+        if cp_size > 1 and padding:
+            # Reshape from [B,S] to [1,T]
+            combined_embeddings = (
+                combined_embeddings.contiguous()
+                .view(combined_embeddings.shape[0] * combined_embeddings.shape[1], -1)
+                .unsqueeze(0)
+            )
+            new_labels = new_labels.view(new_labels.shape[0] * new_labels.shape[1]).unsqueeze(0)
+            new_loss_mask = new_loss_mask.view(
+                new_loss_mask.shape[0] * new_loss_mask.shape[1]
+            ).unsqueeze(0)
+            qkv_format = 'thd'
+
        packed_seq_params = PackedSeqParams(
            cu_seqlens_q=cu_seqlens,
            cu_seqlens_kv=cu_seqlens,
...

@@ -624,41 +706,43 @@ class TestLLaVAModelTokenParallel:
            cu_seqlens_kv_padded=cu_seqlens_padded,
            max_seqlen_q=combined_padded_seqlen,
            max_seqlen_kv=combined_padded_seqlen,
-            qkv_format='thd',
+            qkv_format=qkv_format,
        )

        combined_embeddings, new_labels, new_loss_mask, packed_seq_params = (
-            self.model._process_embedding_token_parallel(
+            model._process_embedding_token_parallel(
                combined_embeddings, new_labels, new_loss_mask, packed_seq_params
            )
        )

-        # Calculate the expected padded seq length
-        if self.cp_size > 1 and self.sequence_parallel:
-            padding_factor = self.tp_size * self.cp_size * 2
-        elif self.cp_size > 1:
-            padding_factor = self.cp_size * 2
-        elif self.sequence_parallel:
-            padding_factor = self.tp_size
-        padded_seq_len = int(
-            (combined_padded_seqlen + (padding_factor - 1)) // padding_factor * padding_factor
-        )
-
        # Check if output shape is as expected
-        if self.cp_size > 1 and self.sequence_parallel:
+        if cp_size > 1 and sequence_parallel:
+            if padding:
                # THD format
                assert combined_embeddings.shape[0] == batch_size * (
-                    padded_seq_len / (self.tp_size * self.cp_size)
+                    combined_padded_seqlen / (tp_size * cp_size)
                )
                assert combined_embeddings.shape[1] == 1
-        elif self.cp_size > 1:
+            else:
+                # SBHD format
+                assert combined_embeddings.shape[0] == (
+                    combined_padded_seqlen / (tp_size * cp_size)
+                )
+                assert combined_embeddings.shape[1] == batch_size
+        elif cp_size > 1:
+            if padding:
                # THD format
-                assert combined_embeddings.shape[0] == batch_size * (padded_seq_len / self.cp_size)
+                assert combined_embeddings.shape[0] == batch_size * (
+                    combined_padded_seqlen / cp_size
+                )
                assert combined_embeddings.shape[1] == 1
            else:
                # SBHD format
-                assert combined_embeddings.shape[0] == padded_seq_len / self.tp_size
+                assert combined_embeddings.shape[0] == (combined_padded_seqlen / cp_size)
                assert combined_embeddings.shape[1] == batch_size
+        else:
+            # SBHD format
+            assert combined_embeddings.shape[0] == combined_padded_seqlen / tp_size
+            assert combined_embeddings.shape[1] == batch_size
...

@@ -690,7 +774,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
    model_parallel_cuda_manual_seed(123)

    language_config = TransformerConfig(
-        num_layers=8,
+        num_layers=12,
        hidden_size=language_hidden_size,
        num_attention_heads=language_num_attention_heads,
        use_cpu_initialization=False,
...

@@ -718,6 +802,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
    vision_layer_spec = get_vit_layer_with_transformer_engine_spec()
    vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)

+    language_config.language_model_type = "dummy"
    vision_config.vision_model_type = "clip"
    non_parallel_model = LLaVAModel(
        language_transformer_config=language_config,
...

@@ -762,7 +847,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
    add_decoder = False if (pp_rank == 0 and epp == 1) else True

    language_config = TransformerConfig(
-        num_layers=8,
+        num_layers=12,
        hidden_size=language_hidden_size,
        num_attention_heads=language_num_attention_heads,
        use_cpu_initialization=False,
...

@@ -790,6 +875,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
    vision_layer_spec = get_vit_layer_with_transformer_engine_spec()
    vision_projection_spec = deepcopy(vision_layer_spec.submodules.mlp.submodules)

+    language_config.language_model_type = "dummy"
    vision_config.vision_model_type = "clip"
    model = LLaVAModel(
        language_transformer_config=language_config,
...

@@ -895,3 +981,39 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
    Utils.destroy_model_parallel()
    torch.cuda.empty_cache()


@pytest.mark.internal
@pytest.mark.parametrize(
    "cp_size, tp_size, has_sp, seq_len, expected_padding",
    [(1, 1, False, 99, 0), (2, 2, True, 99, 5), (2, 2, False, 99, 1)],
)
def test_get_padding(cp_size, tp_size, has_sp, seq_len, expected_padding):
    """Test calculating padding for context parallel."""
    padding = context_parallel.get_padding(seq_len, cp_size, tp_size, has_sp)
    assert padding == expected_padding
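The expected paddings above are consistent with rounding the sequence length up to the nearest multiple of a padding factor, matching the padding_factor logic removed from test_process_embedding_token_parallel earlier in this file (tp_size * cp_size * 2 with sequence parallelism, cp_size * 2 without). A minimal sketch of that assumed round-up rule; round_up_padding is a hypothetical stand-in, not megatron.core.models.multimodal.context_parallel.get_padding:

def round_up_padding(seq_len, cp_size, tp_size, has_sp):
    # Assumed behaviour: pad so the sequence divides evenly across the parallel ranks.
    if cp_size > 1 and has_sp:
        factor = tp_size * cp_size * 2
    elif cp_size > 1:
        factor = cp_size * 2
    elif has_sp:
        factor = tp_size
    else:
        return 0
    # Padding needed to reach the next multiple of the factor.
    return (-seq_len) % factor

assert round_up_padding(99, cp_size=1, tp_size=1, has_sp=False) == 0
assert round_up_padding(99, cp_size=2, tp_size=2, has_sp=True) == 5   # next multiple of 8 is 104
assert round_up_padding(99, cp_size=2, tp_size=2, has_sp=False) == 1  # next multiple of 4 is 100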
@pytest.mark.internal
@pytest.mark.parametrize(
    "tokens, img_seq_len, padding_needed, cp_size, expected_seq_len",
    [(torch.ones((1, 100)), 100, 0, 2, 200), (torch.ones((1, 100)), 128, 1, 2, 227)],
)
def test_get_packed_seq_params(tokens, img_seq_len, padding_needed, cp_size, expected_seq_len):
    """Test creating PackedSeqParams for context parallel."""
    packed_seq_params = context_parallel.get_packed_seq_params(
        tokens, img_seq_len, padding_needed, cp_size
    )

    assert torch.equal(
        packed_seq_params.cu_seqlens_q, torch.tensor([0, expected_seq_len], dtype=torch.int32)
    )

    if padding_needed > 0:
        padded_seq_len = tokens.shape[1] + img_seq_len
        assert torch.equal(
            packed_seq_params.cu_seqlens_q_padded,
            torch.tensor([0, padded_seq_len], dtype=torch.int32),
        )
        assert packed_seq_params.max_seqlen_q == padded_seq_len
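The expected lengths in the parametrization above appear to follow a simple relation: cu_seqlens_q ends at the number of valid tokens (text tokens plus image sequence length minus padding), while cu_seqlens_q_padded ends at the full text-plus-image total. This is an inference from the test's expected values, not a statement about get_packed_seq_params internals:

# Assumed relation behind the expected values above (illustration only).
tokens_len, img_seq_len, padding_needed = 100, 128, 1
assert tokens_len + img_seq_len - padding_needed == 227  # end of cu_seqlens_q
assert tokens_len + img_seq_len == 228                   # end of cu_seqlens_q_padded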
tests/unit_tests/models/test_multimodal_projector.py

...

@@ -3,7 +3,7 @@
import pytest
import torch

-from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
+from megatron.core.models.gpt.gpt_layer_specs import get_mlp_module_spec
from megatron.core.models.vision.multimodal_projector import MultimodalProjector
from megatron.core.tensor_parallel.layers import ColumnParallelLinear
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
...

@@ -20,7 +20,7 @@ class TestMultimodalProjector:
        transformer_config = TransformerConfig(
            num_layers=1, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True
        )
-        mlp_layer_spec = _get_mlp_module_spec().submodules
+        mlp_layer_spec = get_mlp_module_spec().submodules

        affine_layer_spec = MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=None)
        self.mlp = MultimodalProjector(
...
tests/unit_tests/models/test_radio_model.py (new file)

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import pytest
import torch

from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.models.vision.radio import RADIOViTModel
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from tests.unit_tests.test_utilities import Utils


class TestRADIOViTModel:
    """Test RADIO ViT model."""

    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1)
        model_parallel_cuda_manual_seed(123)
        transformer_config = TransformerConfig(
            num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True
        )
        transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec()
        self.model = RADIOViTModel(
            transformer_config,
            transformer_layer_spec,
            img_h=224,
            img_w=224,
            patch_dim=14,
            add_class_token=False,
        )

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_constructor(self):
        assert isinstance(self.model, RADIOViTModel)

        num_weights = sum([p.numel() for p in self.model.parameters()])
        assert num_weights == 1501824

    def test_set_input_tensor(self):
        # [s, b, h] expected to the transformer.
        expected_shape = (256, 2, 64)
        input_tensor = torch.zeros(expected_shape)
        self.model.set_input_tensor(input_tensor)
        assert self.model.decoder.input_tensor.shape == torch.Size(expected_shape)

    def test_forward(self):
        self.model.cuda()

        img = torch.zeros((2, 3, 224, 224)).cuda()
        out = self.model.forward(img)
        assert out.shape == torch.Size([2, 256, 64])

    def test_save_load(self, tmp_path):
        path = tmp_path / "model.pt"
        torch.save(self.model.state_dict(), path)
        self.model.load_state_dict(torch.load(path))
tests/unit_tests/models/test_t5_model.py

...

@@ -107,6 +107,7 @@ class TestT5Model:
        assert self.t5_model.encoder_hidden_state.shape[1] == micro_batch_size
        assert self.t5_model.encoder_hidden_state.shape[2] == config.hidden_size

+    @pytest.mark.flaky_in_dev
    def test_post_process_forward(self):
        config: TransformerConfig = self.t5_model.config
        sequence_length = self.t5_model.max_sequence_length
...

@@ -156,6 +157,7 @@ class TestT5Model:
        assert encoder_hidden_states.shape[1] == micro_batch_size
        assert encoder_hidden_states.shape[2] == config.hidden_size

+    @pytest.mark.flaky_in_dev
    def test_forward_output_encoder_hidden_only(self):
        config: TransformerConfig = self.t5_model.config
        sequence_length = self.t5_model.max_sequence_length
...

@@ -191,6 +193,7 @@ class TestT5Model:
        assert encoder_hidden_states.shape[1] == micro_batch_size
        assert encoder_hidden_states.shape[2] == config.hidden_size

+    @pytest.mark.flaky_in_dev
    def test_forward_with_encoder_hidden_states(self):
        config: TransformerConfig = self.t5_model.config
        sequence_length = self.t5_model.max_sequence_length
...
tests/unit_tests/pipeline_parallel/test_schedules.py

...

@@ -269,3 +269,119 @@ def test_forward_backward_func_with_interleaving(mocker):
        assert i['loss_reduced'] == j['loss_reduced']
    Utils.destroy_model_parallel()


def test_forward_backward_func_with_uneven_interleaving(mocker):
    from megatron.core.enums import ModelType
    from megatron.core.pipeline_parallel import get_forward_backward_func

    Utils.initialize_model_parallel(
        tensor_model_parallel_size=1,
        pipeline_model_parallel_size=4,
        virtual_pipeline_model_parallel_size=2,
    )

    def forward_step_func(data_iterator, model):
        import os

        rank = int(os.environ['LOCAL_RANK'])

        def loss_func(output_tensor):
            return rank, {'loss_reduced': rank}

        return torch.rand(512, 8, 256).cuda(), loss_func

    model_a = torch.nn.Linear(4, 1)
    model_b = torch.nn.Linear(8, 1)

    def set_input_tensor(input_tensor):
        return None

    model_a.set_input_tensor = set_input_tensor
    model_b.set_input_tensor = set_input_tensor

    forward_backward_func = get_forward_backward_func()
    assert (
        schedule.get_forward_backward_func()
        == schedule.forward_backward_pipelining_with_interleaving
    )

    sequence_length = 512
    micro_batch_size = 8
    hidden_size = 256

    config = ModelParallelConfig(
        pipeline_model_parallel_size=4, sequence_parallel=False, pipeline_dtype=torch.float
    )
    config.hidden_size = hidden_size
    model_a.config = config
    model_b.config = config

    mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2)

    with pytest.raises(RuntimeError):
        model_a.model_type = ModelType.encoder_and_decoder
        model_b.model_type = ModelType.encoder_and_decoder
        forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=[range(0, 100)],
            model=[model_a, model_b],
            num_microbatches=micro_batch_size,
            seq_length=sequence_length,
            micro_batch_size=micro_batch_size,
            decoder_seq_length=sequence_length,
            forward_only=True,
        )

    with pytest.raises(RuntimeError):
        model_a.model_type = ModelType.encoder_or_decoder
        model_b.model_type = ModelType.encoder_or_decoder
        forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=[range(0, 100)],
            model=[model_a, model_b],
            num_microbatches=micro_batch_size,
            seq_length=sequence_length,
            micro_batch_size=micro_batch_size,
            decoder_seq_length=256,
            forward_only=True,
        )

    with pytest.raises(RuntimeError):
        model_a.model_type = ModelType.encoder_or_decoder
        model_b.model_type = ModelType.encoder_or_decoder
        forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=[range(0, 100)],
            model=[model_a, model_b],
            num_microbatches=7,
            seq_length=sequence_length,
            micro_batch_size=micro_batch_size,
            decoder_seq_length=512,
            forward_only=True,
        )

    model_a.model_type = ModelType.encoder_or_decoder
    model_b.model_type = ModelType.encoder_or_decoder
    losses_reduced = forward_backward_func(
        forward_step_func=forward_step_func,
        data_iterator=[range(0, 100), range(0, 100)],
        model=[model_a, model_b],
        num_microbatches=micro_batch_size,
        seq_length=sequence_length,
        micro_batch_size=micro_batch_size,
        decoder_seq_length=sequence_length,
        forward_only=True,
    )
    loss_reduced_expected = [
        {'loss_reduced': rank},
        {'loss_reduced': rank},
        {'loss_reduced': rank},
        {'loss_reduced': rank},
    ]
    for i, j in zip(losses_reduced, loss_reduced_expected):
        print(losses_reduced)
        assert i['loss_reduced'] == j['loss_reduced']
    Utils.destroy_model_parallel()
tests/unit_tests/test_model_configs.py (new file)

import pathlib

import pytest
import yaml

YAML_DIR = pathlib.Path(__file__).parent / ".." / "functional_tests" / "test_cases"


def get_yaml_files(directory):
    """Retrieve all YAML files from the specified directory."""
    return list([file for file in directory.rglob("*.yaml") if file is not None])


def load_yaml(file_path):
    """Load a YAML file and return its content as a Python dictionary."""
    with open(file_path, "r") as f:
        return yaml.safe_load(f)


@pytest.mark.parametrize(
    "metric",
    ["--log-memory-to-tensorboard", "--log-num-zeros-in-grad", "--log-timers-to-tensorboard"],
)
@pytest.mark.parametrize("yaml_file", get_yaml_files(YAML_DIR))
def test_model_config_tracks_memory(yaml_file, metric):
    """Test if each YAML file contains the required record."""
    print("gpt3-nemo" in str(yaml_file) or "ckpt_converter" in str(yaml_file))
    if "gpt3-nemo" in str(yaml_file) or "ckpt_converter" in str(yaml_file):
        pytest.skip("Skipping for gpt-nemo")

    model_config = load_yaml(yaml_file)

    assert (
        "MODEL_ARGS" in model_config
        and metric in model_config["MODEL_ARGS"]
        and model_config["MODEL_ARGS"][metric] is True
    ), f"Please add argument `{metric}` to `{yaml_file.parent.name}/model_config.yaml` that its metric gets tracked."
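The new test above only requires that each functional-test model_config.yaml carry a MODEL_ARGS mapping with the three logging flags set to true. A minimal sketch of a configuration shape that would satisfy the assertion, expressed through yaml.safe_load as the test does (illustrative only; real test cases contain many more MODEL_ARGS entries):

import yaml

sample = yaml.safe_load(
    """
MODEL_ARGS:
  --log-memory-to-tensorboard: true
  --log-num-zeros-in-grad: true
  --log-timers-to-tensorboard: true
"""
)
for metric in (
    "--log-memory-to-tensorboard",
    "--log-num-zeros-in-grad",
    "--log-timers-to-tensorboard",
):
    # Each flag must be present and parsed as a boolean true.
    assert sample["MODEL_ARGS"][metric] is True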
tests/unit_tests/test_optimizer.py

import os

import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD, Adam

-from megatron.core.optimizer import ChainedOptimizer
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.optimizer import ChainedOptimizer, OptimizerConfig, get_megatron_optimizer
+from megatron.core.transformer import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+from tests.unit_tests.test_utils import _deinit_distributed, _init_distributed


class Net(nn.Module):
...

@@ -111,3 +118,45 @@ def test_precision_aware_fused_adam():
        bytes_2 = p_2.data.view(torch.uint8)

        # Make sure bit-wise matched
        assert torch.all(bytes_1 == bytes_2)


@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
@pytest.mark.parametrize("precision", ['bf16', 'fp32'])
def test_optim_sharded_state_dict(use_distributed_optimizer: bool, precision: str):
    world = int(os.getenv('WORLD_SIZE', '1'))
    rank = int(os.getenv('RANK', '0'))

    # Setup: distributed, model, mock_args.
    _init_distributed(world, rank)
    Utils.initialize_model_parallel()
    model = torch.nn.Linear(100, 100, bias=False, dtype=torch.bfloat16, device='cuda')
    model.requires_grad_(True)
    model.weight.data.fill_(1.0)
    ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=use_distributed_optimizer)
    model = DistributedDataParallel(
        TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
    )
    for param in model.parameters():
        assert param.requires_grad

    if precision == 'bf16':
        optimizer_config = OptimizerConfig(
            optimizer='adam', bf16=True, use_distributed_optimizer=use_distributed_optimizer
        )
    elif precision == 'fp32':
        optimizer_config = OptimizerConfig(
            optimizer='adam',
            bf16=False,
            fp16=False,
            use_distributed_optimizer=use_distributed_optimizer,
        )
    optim = get_megatron_optimizer(optimizer_config, [model])

    model_sharded_state_dict = model.sharded_state_dict()
    sharded_state_dict = optim.sharded_state_dict(model_sharded_state_dict)
    if 'optimizer' in sharded_state_dict and 'state' in sharded_state_dict['optimizer']:
        assert (
            'common_step' not in sharded_state_dict['optimizer']['state']
            or sharded_state_dict['optimizer']['state']['common_step'] is not None
        ), "Found 'optimizer.state.common_step=None' in sharded state dict."
tests/unit_tests/test_optimizer_cpu_offloading.py (new file)

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import random

import numpy as np
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD, Adam

try:
    from transformer_engine.pytorch.optimizers import FusedAdam as GPUAdam
    from transformer_engine.pytorch.optimizers import FusedSGD as GPUSGD
except:
    # Handle environment where transformer_engine is not installed
    from torch.optim import SGD as GPUSGD
    from torch.optim import Adam as GPUAdam

from megatron.core.optimizer.cpu_offloading import HybridDeviceOptimizer


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def setup_seed(seed):
    random.seed(seed)  # Set Python's built-in random seed
    np.random.seed(seed)  # Set NumPy's random seed
    torch.manual_seed(seed)  # Set PyTorch's CPU seed
    torch.cuda.manual_seed(seed)  # Set PyTorch's GPU seed (if using CUDA)
    torch.cuda.manual_seed_all(seed)  # Set seed for all GPUs
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False  # Disable auto-tuner for reproducibility


@pytest.mark.skipif(
    torch.__version__ < '2.3.0',
    reason=(
        "Requires PyTorch 2.3.0 or higher, lower versions of pytorch have "
        "misaligned optimizer accuracy for CPU and GPU."
    ),
)
@pytest.mark.parametrize('n_steps', [1, 10])
@pytest.mark.parametrize('overlap_cpu_optimizer_d2h_h2d', [False, True])
@pytest.mark.parametrize('offload_fraction', [0, 0.5, 1.0])
@pytest.mark.parametrize('optimizer', ['sgd', 'adam'])
@pytest.mark.parametrize('with_param_groups', [False, True])
def test_multi_device_hybrid_optimizer(
    with_param_groups, optimizer, offload_fraction, overlap_cpu_optimizer_d2h_h2d, n_steps
):
    setup_seed(42)
    net1 = Net().cuda()
    net2 = Net().cuda()
    net2.load_state_dict(net1.state_dict())
    base_lr = 1e-3

    params = list(net1.parameters())
    ref_params = list(net2.parameters())
    if with_param_groups:
        param_groups = [
            {"params": params[: len(params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4},
            {"params": params[len(params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4},
        ]
        params = param_groups
        ref_param_groups = [
            {"params": ref_params[: len(ref_params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4},
            {"params": ref_params[len(ref_params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4},
        ]
        ref_params = ref_param_groups

    if optimizer == 'adam':
        cls_kwargs = dict(cpu_optimizer_cls=Adam, gpu_optimizer_cls=GPUAdam)
    else:
        cls_kwargs = dict(cpu_optimizer_cls=SGD, gpu_optimizer_cls=GPUSGD)

    hdo = HybridDeviceOptimizer(
        params,
        offload_fraction=offload_fraction,
        lr=base_lr,
        overlap_cpu_optimizer_d2h_h2d=overlap_cpu_optimizer_d2h_h2d,
        **cls_kwargs,
    )
    ref_optimizer = cls_kwargs['gpu_optimizer_cls'](ref_params, lr=base_lr)

    # 1. run step on optimizer, make sure there is state generated
    assert len(hdo.state_dict()["state"]) == 0  # state is empty
    input = torch.randn(1, 3, 32, 32).cuda()
    output = net1(input)
    output.sum().backward()
    hdo.step()
    output = net2(input)
    output.sum().backward()
    ref_optimizer.step()
    # PyTorch SGD will not generate state
    if optimizer != 'sgd':
        assert len(hdo.state_dict()["state"]) != 0

    # 2. check the state is on right device
    if optimizer == 'adam':
        first_param_id = hdo.state_dict()["param_groups"][0]["params"][0]
        last_param_id = hdo.state_dict()["param_groups"][-1]["params"][-1]
        if offload_fraction > 0:
            assert not hdo.state_dict()["state"][first_param_id]["exp_avg"].is_cuda
        if offload_fraction < 1:
            assert hdo.state_dict()["state"][last_param_id]["exp_avg"].is_cuda

    # 3. check parameters allclose
    for _ in range(1, n_steps):
        input = torch.randn(1, 3, 32, 32).cuda()
        output = net1(input)
        output.sum().backward()
        hdo.step()
        output = net2(input)
        output.sum().backward()
        ref_optimizer.step()

    params = net1.state_dict()
    ref_params = net2.state_dict()
    for k, v in params.items():
        assert (v.isnan() == ref_params[k].isnan()).all()
        torch.nan_to_num_(v, 0)
        torch.nan_to_num_(ref_params[k], 0)
        assert torch.allclose(
            v, ref_params[k], atol=1e-03
        ), f"Weight {k} value mismatch, max error: {(v - ref_params[k]).abs().max()}"
tests/unit_tests/test_parallel_state.py

...

@@ -178,12 +178,17 @@ def test_encoder_tensor_pipeline_parallelism(order):
    if rank < 2:
        assert ps.get_tensor_model_parallel_world_size() == 3
        assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], list)
        last_ranks = ps.get_pipeline_model_parallel_last_rank()
        assert isinstance(last_ranks, list)
        assert len(last_ranks) == 2
    elif rank == 2:
        assert ps.get_tensor_model_parallel_world_size() == 3
        assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], int)
        assert isinstance(ps.get_pipeline_model_parallel_last_rank(), int)
    else:
        assert ps.get_tensor_model_parallel_world_size() == 5
        assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], int)
        assert isinstance(ps.get_pipeline_model_parallel_last_rank(), int)
    Utils.destroy_model_parallel()
...

@@ -255,6 +260,8 @@ def test_different_initialize_order_consistency(src_tp_pp, ep_size):
    'src_tp_pp, ep_size',
    [((1, 2), 1), ((1, 4), 1), ((2, 2), 1), ((1, 2), 2), ((1, 4), 2), ((2, 2), 2)],
)
+@pytest.mark.flaky
+@pytest.mark.flaky_in_dev
def test_different_initialize_order_unconsistency(src_tp_pp, ep_size):
    Utils.initialize_model_parallel(
        *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-ep-dp-pp'
...
tests/unit_tests/test_utils.py

import os
import time
import urllib.request as req
from types import SimpleNamespace

import mock
import numpy as np
import pytest
import torch

import megatron.core.utils as util
import megatron.training.utils as training_util
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
+from megatron.core.transformer import TransformerConfig
from tests.unit_tests.test_utilities import Utils
...

@@ -65,6 +71,7 @@ def _deinit_distributed():
    torch.distributed.barrier()


+@pytest.mark.flaky_in_dev
def test_check_param_hashes_across_dp_replicas():
    world = int(os.getenv('WORLD_SIZE', '1'))
    rank = int(os.getenv('RANK', '0'))
...

@@ -72,7 +79,7 @@ def test_check_param_hashes_across_dp_replicas():
    # Setup.
    _init_distributed(world, rank)
    Utils.initialize_model_parallel()
-    model = torch.nn.Linear(100, 100, bias=False)
+    model = torch.nn.Linear(100, 100, bias=False, device='cuda')

    # First check case where all replicas agree.
    model.weight.data.fill_(1.0)
...

@@ -89,6 +96,7 @@ def test_check_param_hashes_across_dp_replicas():
    _deinit_distributed()


+@pytest.mark.flaky_in_dev
def test_cross_check_param_hashes_across_dp_replicas():
    world = int(os.getenv('WORLD_SIZE', '1'))
    rank = int(os.getenv('RANK', '0'))
...

@@ -96,7 +104,7 @@ def test_cross_check_param_hashes_across_dp_replicas():
    # Setup.
    _init_distributed(world, rank)
    Utils.initialize_model_parallel()
-    model = torch.nn.Linear(100, 100, bias=False)
+    model = torch.nn.Linear(100, 100, bias=False, device='cuda')

    # First check case where all replicas agree.
    model.weight.data.fill_(1.0)
...

@@ -111,6 +119,57 @@ def test_cross_check_param_hashes_across_dp_replicas():
    _deinit_distributed()


@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
@pytest.mark.flaky_in_dev
def test_param_norm(use_distributed_optimizer: bool):
    world = int(os.getenv('WORLD_SIZE', '1'))
    rank = int(os.getenv('RANK', '0'))

    # Setup: distributed, model, mock_args.
    _init_distributed(world, rank)
    Utils.initialize_model_parallel()
    model = torch.nn.Linear(100, 100, bias=False, dtype=torch.bfloat16, device='cuda')
    model.requires_grad_(True)
    model.weight.data.fill_(1.0)
    ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=use_distributed_optimizer)
    # Use dummy TransformerConfig which doesn't trigger __post_init__ assertions.
    model = DistributedDataParallel(
        TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
    )
    for param in model.parameters():
        assert param.requires_grad

    mock_args = SimpleNamespace(bf16=True)
    with mock.patch('megatron.training.utils.get_args', new=lambda: mock_args):
        # Make sure norm is correct when `main_param` attribute is not available.
        assert training_util.calc_params_l2_norm(
            model, force_create_fp32_copy=False
        ) == pytest.approx(100.0)
        assert training_util.calc_params_l2_norm(
            model, force_create_fp32_copy=True
        ) == pytest.approx(100.0)

        # Make sure norm is correct when `main_param` attribute is available.
        optimizer_config = OptimizerConfig(
            bf16=True, use_distributed_optimizer=use_distributed_optimizer
        )
        _ = get_megatron_optimizer(optimizer_config, [model])
        for param in model.parameters():
            assert hasattr(param, 'main_param')
            if use_distributed_optimizer:
                assert getattr(param, 'main_param_sharded', False)
        assert training_util.calc_params_l2_norm(
            model, force_create_fp32_copy=False
        ) == pytest.approx(100.0)
        assert training_util.calc_params_l2_norm(
            model, force_create_fp32_copy=True
        ) == pytest.approx(100.0)

    # Teardown.
    _deinit_distributed()
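The expected value of 100.0 in test_param_norm follows directly from the weight initialization: a 100x100 matrix filled with 1.0 has L2 norm sqrt(100 * 100 * 1^2) = 100. A minimal check of that arithmetic with plain PyTorch (independent of calc_params_l2_norm):

import math

import torch

# A 100x100 weight filled with ones has L2 norm sqrt(10000) = 100,
# which is why both calc_params_l2_norm calls above expect approximately 100.0.
w = torch.ones(100, 100)
assert math.isclose(w.norm(p=2).item(), 100.0, rel_tol=1e-6)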
+@pytest.mark.flaky_in_dev
def test_straggler_detector():
    world = int(os.getenv('WORLD_SIZE', '1'))
    rank = int(os.getenv('RANK', '0'))
...
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py View file @ 688448db
...
...
@@ -4,7 +4,10 @@ import pytest
import torch

from tests.unit_tests.test_utilities import Utils
from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer
from tests.unit_tests.transformer.moe.test_token_dispatcher import (
    MoEModelTestContainer,
    permute_fusion_params,
)


def test_placeholder():
...
...
@@ -12,7 +15,6 @@ def test_placeholder():
    pass


@pytest.mark.flaky
class TestAlltoAllDispatcher:
    def setup_method(self, method):
        pass
...
...
@@ -24,9 +26,8 @@ class TestAlltoAllDispatcher:
    @pytest.mark.internal
    @pytest.mark.timeout(120)
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_forward_backward(self, tp_size, ep_size):
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    def test_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
...
...
@@ -35,6 +36,7 @@ class TestAlltoAllDispatcher:
            moe_router_topk=2,
            moe_router_load_balancing_type="aux_loss",
            moe_token_dispatcher_type="alltoall",
            moe_permute_fusion=permute_fusion,
        )
        container.dispatcher_dropless_test()
...
...
@@ -42,8 +44,6 @@ class TestAlltoAllDispatcher:
    @pytest.mark.internal
    @pytest.mark.timeout(120)
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_a2aseq_forward_backward(self, tp_size, ep_size):
        container = MoEModelTestContainer(
            tp_size=tp_size,
...
...
@@ -53,6 +53,7 @@ class TestAlltoAllDispatcher:
            moe_router_topk=2,
            moe_router_load_balancing_type="aux_loss",
            moe_token_dispatcher_type="alltoall_seq",
            moe_permute_fusion=False,
        )
        container.dispatcher_dropless_test()
...
...
@@ -60,9 +61,8 @@ class TestAlltoAllDispatcher:
    @pytest.mark.internal
    @pytest.mark.timeout(120)
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_capacity_forward_backward(self, tp_size, ep_size):
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
...
...
@@ -74,6 +74,7 @@ class TestAlltoAllDispatcher:
            moe_token_drop_policy="probs",
            moe_expert_capacity_factor=0.5,
            moe_pad_expert_input_to_capacity=False,
            moe_permute_fusion=permute_fusion,
        )
        container.dispatcher_capacity_test()
...
...
@@ -81,9 +82,8 @@ class TestAlltoAllDispatcher:
    @pytest.mark.internal
    @pytest.mark.timeout(120)
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_capacity_padding_forward_backward(self, tp_size, ep_size):
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    def test_capacity_padding_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
...
...
@@ -95,5 +95,6 @@ class TestAlltoAllDispatcher:
            moe_token_drop_policy="probs",
            moe_expert_capacity_factor=0.6,
            moe_pad_expert_input_to_capacity=True,
            moe_permute_fusion=permute_fusion,
        )
        container.dispatcher_drop_and_pad_test()
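For context on the `permute_fusion` parameter introduced above: `permute_fusion_params` comes from test_token_dispatcher.py (defined later in this commit as `[False]`, with `True` appended when TE >= 1.14.0), and stacking it as an additional `@pytest.mark.parametrize` makes pytest run every `(tp_size, ep_size)` pair with every fusion setting. A small self-contained sketch of that stacking behavior, using illustrative values rather than the repo's own:

import pytest

# Hypothetical stand-in for permute_fusion_params; pytest collects the
# cross-product: 2 (tp, ep) pairs x 2 fusion flags = 4 test cases.
fusion_params = [False, True]


@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1)])
@pytest.mark.parametrize("permute_fusion", fusion_params)
def test_cross_product(tp_size, ep_size, permute_fusion):
    assert isinstance(permute_fusion, bool)
    assert (tp_size, ep_size) in {(1, 8), (8, 1)}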
tests/unit_tests/transformer/moe/test_aux_loss.py View file @ 688448db
...
...
@@ -12,7 +12,7 @@ from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestC
class AuxlossTestContainer(MoEModelTestContainer):
    def partition_input(self, input):
        partitioned_input = input.chunk(
            parallel_state.get_tensor_and_context_parallel_world_size(), dim=1
            parallel_state.get_tensor_and_context_parallel_world_size(), dim=0
        )[parallel_state.get_tensor_and_context_parallel_rank()]
        output = partitioned_input.clone().detach()
        output.requires_grad = True
...
...
@@ -126,7 +126,9 @@ class TestSeqAuxLoss:
    @pytest.mark.internal
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.parametrize("tp_size,ep_size,cp_size", [(1, 8, 1)])
    @pytest.mark.parametrize(
        "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]
    )
    def test_a2a_dispatcher(self, tp_size, ep_size, cp_size):
        container = AuxlossTestContainer(
            tp_size=tp_size,
...
...
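On the `dim=1` to `dim=0` change in `partition_input` above: the input is now chunked along its first dimension across the tensor-and-context-parallel group, and each rank keeps only its own shard before re-enabling gradients on the copy. A minimal sketch of that chunk-and-select pattern, with a hard-coded group size and rank for illustration:

import torch

world_size, rank = 4, 1                 # illustrative group size and rank
full_input = torch.randn(8, 2, 16)      # e.g. [sequence, batch, hidden]

# Split along dim 0 and keep the local shard, as partition_input now does.
local_shard = full_input.chunk(world_size, dim=0)[rank]
output = local_shard.clone().detach()
output.requires_grad = True

assert output.shape == (8 // world_size, 2, 16)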
tests/unit_tests/transformer/moe/test_moe_layer.py View file @ 688448db
...
...
@@ -80,7 +80,7 @@ class TestMoELayerInit:
        )
        Utils.destroy_model_parallel()

    @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"])
    @pytest.mark.parametrize("moe_token_dispatcher_type", ["alltoall", "allgather"])
    @pytest.mark.parametrize("grouped_gemm", [True, False])
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 1), (2, 2)])
    def test_moe_with_late_initialize(
...
...
tests/unit_tests/transformer/moe/test_routers.py View file @ 688448db
...
...
@@ -5,6 +5,7 @@ import torch
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.moe_utils import get_updated_expert_bias
from megatron.core.transformer.moe.router import Router
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training.initialize import _set_random_seed
...
...
@@ -47,12 +48,13 @@ class TestTop2Router:
    @pytest.mark.internal
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.parametrize("moe_router_pre_softmax", [(True), (False)])
    def test_router_forward(self, moe_router_pre_softmax):
    @pytest.mark.parametrize("score_function", ["sigmoid", "softmax"])
    def test_router_forward(self, moe_router_pre_softmax, score_function):
        with torch.no_grad():
            self.router = self.router.cuda()
            self.router.config.moe_router_pre_softmax = moe_router_pre_softmax
            self.router.config.moe_router_score_function = score_function
            # [num tokens, hidden size]
            hidden_states = torch.randn((32, 2, self.router.config.hidden_size))
            hidden_states = hidden_states.cuda()
...
...
@@ -60,7 +62,6 @@ class TestTop2Router:
    @pytest.mark.internal
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    def test_aux_loss(self):
        self.sequential_mlp = self.sequential_mlp.cuda()
...
...
@@ -86,60 +87,149 @@ class TestTop2Router:
        assert self.sequential_mlp.router.weight.grad.abs().sum() > 0


class TestDeviceLimitedTop2Router:
class TestGroupLimitedRouter:
    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1, expert_model_parallel_size=8)
        Utils.initialize_model_parallel(
            tensor_model_parallel_size=1,
            pipeline_model_parallel_size=1,
            expert_model_parallel_size=8,
            context_parallel_size=1,
        )
        _set_random_seed(seed_=123, data_parallel_random_init=False)
        print("done intializing")
        num_moe_experts = 8
        num_moe_experts = 16
        self.transformer_config = TransformerConfig(
            tensor_model_parallel_size=1,
            pipeline_model_parallel_size=1,
            expert_model_parallel_size=8,
            context_parallel_size=1,
            num_moe_experts=num_moe_experts,
            moe_router_topk=4,
            moe_router_group_topk=2,
            moe_router_num_groups=8,
            moe_router_pre_softmax=True,
            moe_router_load_balancing_type="aux_loss",
            moe_aux_loss_coeff=0,
            moe_token_dispatcher_type="alltoall",
            num_layers=2,
            hidden_size=12,
            num_attention_heads=4,
            num_moe_experts=num_moe_experts,
            use_cpu_initialization=True,
            expert_model_parallel_size=8,
            moe_router_load_balancing_type="aux_loss",
            moe_router_topk_limited_devices=2,
            moe_router_pre_softmax=True,
            moe_router_topk=2,
            moe_aux_loss_coeff=0,
        )

        # init MoE layer
        transformer_layer_spec = get_gpt_layer_local_spec(
            num_experts=num_moe_experts, moe_grouped_gemm=False
        )
        self.sequential_mlp = MoELayer(
        self.moe_layer = MoELayer(
            self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
        )
        self.router = self.sequential_mlp.router
        ).cuda()
        self.router = self.moe_layer.router

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    @pytest.mark.internal
    def test_constructor(self):
        assert isinstance(self.router, Router)
        num_weights = sum([p.numel() for p in self.router.parameters()])
        assert num_weights == 12 * 8, num_weights
        assert (
            num_weights
            == self.transformer_config.hidden_size * self.transformer_config.num_moe_experts
        ), num_weights

    @pytest.mark.internal
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.parametrize("moe_router_group_topk,moe_router_num_groups", [(3, 8), (2, 4)])
    @pytest.mark.parametrize("moe_router_pre_softmax", [(True), (False)])
    def test_router_forward(self, moe_router_pre_softmax):
    @pytest.mark.parametrize("score_function", ["sigmoid", "softmax"])
    def test_router_forward(
        self, moe_router_group_topk, moe_router_num_groups, moe_router_pre_softmax, score_function
    ):
        with torch.no_grad():
            self.router = self.router.cuda()
            self.router.config.moe_router_group_topk = moe_router_group_topk
            self.router.config.moe_router_num_groups = moe_router_num_groups
            self.router.config.moe_router_pre_softmax = moe_router_pre_softmax
            self.router.config.moe_router_score_function = score_function
            if moe_router_pre_softmax:
                self.router.config.moe_router_topk_scaling_factor = 16.0

            # [num tokens, hidden size]
            seq_len = 2
            batch_size = 2
            num_tokens = seq_len * batch_size
            # hidden_states shape: [seq_len, batch_size, hidden_size]
            hidden_states = torch.randn(
                (seq_len, batch_size, self.router.config.hidden_size)
            ).cuda()
            scores, routing_map = self.router(hidden_states)

            assert scores.shape == (num_tokens, self.router.config.num_moe_experts), scores.shape
            assert routing_map.shape == (
                num_tokens,
                self.router.config.num_moe_experts,
            ), routing_map.shape

            group_routing_map = (
                routing_map.reshape(num_tokens, moe_router_num_groups, -1).max(dim=-1).values
            )
            assert torch.all(group_routing_map.sum(dim=-1) <= moe_router_group_topk)


class TestAuxLossFreeTop2Router:
    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1, expert_model_parallel_size=8)
        _set_random_seed(seed_=123, data_parallel_random_init=False)
        print("done intializing")
        num_moe_experts = 8
        self.transformer_config = TransformerConfig(
            num_layers=2,
            hidden_size=12,
            num_attention_heads=4,
            num_moe_experts=num_moe_experts,
            use_cpu_initialization=True,
            expert_model_parallel_size=8,
            moe_router_load_balancing_type="none",  # No aux loss
            moe_router_score_function="sigmoid",  # Using sigmoid scoring
            moe_router_enable_expert_bias=True,  # Enable expert bias
            moe_router_bias_update_rate=0.1,  # Set bias update rate
            moe_router_topk=2,
        )
        transformer_layer_spec = get_gpt_layer_local_spec(
            num_experts=num_moe_experts, moe_grouped_gemm=False
        )
        self.moe_layer = MoELayer(
            self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
        )
        self.router = self.moe_layer.router
        assert self.router.expert_bias is not None
        assert self.router.local_tokens_per_expert is not None

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_router_forward_aux_free(self):
        hidden_states = torch.randn((32, 2, self.router.config.hidden_size))
        hidden_states = hidden_states.cuda()
        scores, indices = self.router(hidden_states)
        print(scores.shape, indices.shape)
        assert scores.shape == (64, 8)
        assert indices.shape == (64, 8)
        print(
            (indices == 0).sum(), (indices == 1).sum(), (indices == 2).sum(), (indices == 3).sum(),
        self.router = self.router.cuda()

        # First forward pass
        initial_bias = self.router.expert_bias.clone()
        scores1, indices1 = self.router(hidden_states)
        initial_tokens = self.router.local_tokens_per_expert.clone()
        updated_bias = get_updated_expert_bias(
            self.router.local_tokens_per_expert,
            self.router.expert_bias,
            self.router.config.moe_router_bias_update_rate,
        )

        # Verify expert bias was updated
        assert not torch.equal(initial_bias, updated_bias), "Expert bias should be updated"

        # Basic output checks
        assert scores1.shape == (64, 8), "Router scores shape mismatch"
        assert indices1.shape == (64, 8), "Router indices shape mismatch"

        # Print some debug info
        print("Updated bias after first forward pass:", updated_bias)
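The group-limited assertion in `TestGroupLimitedRouter.test_router_forward` above reads as: fold the boolean per-expert routing map into a per-group map (a group counts as hit if any of its experts is selected), then require that each token touches at most `moe_router_group_topk` groups. A standalone sketch of the same check on a toy routing map (the values here are made up for illustration, and `.any(dim=-1)` plays the role of the `.max(dim=-1).values` reduction in the test):

import torch

num_tokens, num_experts, num_groups, group_topk = 4, 16, 8, 2
experts_per_group = num_experts // num_groups  # 2 experts per group

# Toy routing map: token i routes to the two experts of group i.
routing_map = torch.zeros(num_tokens, num_experts, dtype=torch.bool)
for i in range(num_tokens):
    routing_map[i, experts_per_group * i] = True
    routing_map[i, experts_per_group * i + 1] = True

# A group is "hit" if any of its experts is routed to; each token may hit
# at most group_topk groups.
group_routing_map = routing_map.reshape(num_tokens, num_groups, -1).any(dim=-1)
assert torch.all(group_routing_map.sum(dim=-1) <= group_topk)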
tests/unit_tests/transformer/moe/test_token_dispatcher.py View file @ 688448db
...
...
@@ -8,8 +8,8 @@ import torch
from megatron.core import parallel_state
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.moe_utils import permute, unpermute
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import is_te_min_version
from megatron.training.initialize import _set_random_seed
from tests.unit_tests.test_utilities import Utils
...
...
@@ -69,6 +69,8 @@ class MoEModelTestContainer:
            use_cpu_initialization=kwargs.get("use_cpu_initialization", True),
            sequence_parallel=tp_size > 1,
            add_bias_linear=kwargs.get("add_bias_linear", False),
            moe_permute_fusion=kwargs.get("moe_permute_fusion", False),
            moe_enable_deepep=kwargs.get("moe_enable_deepep", False),
        )

        # init moe layer
...
...
@@ -94,31 +96,30 @@ class MoEModelTestContainer:
        moe_layer = self.moe_layer
        bs = 32
        seql = 8
        # TODO: Find why setting manual seed can cause the test to fail
        # Manual seed to differentiate input data for each rank
        # rank = torch.distributed.get_rank()
        # torch.manual_seed(1000 + rank)
        hidden_states = torch.randn((bs, seql, moe_layer.config.hidden_size))
        hidden_states = hidden_states.cuda()
        ans = hidden_states / 2
        # Permute and then unpermute data are supposed to restore original data
        ans = hidden_states
        hidden_states.requires_grad = True
        probs, indices = moe_layer.router(hidden_states)
        probs = torch.ones_like(probs) / moe_layer.router.topk / 2

        ## Uncomment these lines to assist in bug location.
        # hidden_states = torch.ones_like(hidden_states) * torch.distributed.get_rank()
        # hidden_states.requires_grad = True
        # indices = torch.ones_like(indices) * torch.distributed.get_rank()
        # print(permuted_local_hidden_states)
        probs = torch.ones_like(probs) / moe_layer.router.topk

        (permuted_local_hidden_states, tokens_per_expert) = (
            moe_layer.token_dispatcher.token_permutation(hidden_states, probs, indices)
        )
        scale = moe_layer.config.expert_tensor_parallel_size
        permuted_local_hidden_states /= scale
        restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation(
            permuted_local_hidden_states
        )
        # reduce across TP rank equals to multiply data by a scale of ETP
        scale = moe_layer.config.expert_tensor_parallel_size
        restored_hidden_states = restored_hidden_states / scale

        assert torch.allclose(
            restored_hidden_states, ans
        ), "Restored hidden states do not match original hidden states"
...
...
@@ -147,8 +148,6 @@ class MoEModelTestContainer:
            moe_layer.token_dispatcher.token_permutation(hidden_states, probs, indices)
        )
        print(f"Dispatched tokens per expert: {tokens_per_expert}")
        permuted_local_hidden_states /= moe_layer.config.tensor_model_parallel_size
        restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation(
...
...
@@ -220,6 +219,11 @@ class MoEModelTestContainer:
        Utils.destroy_model_parallel()


permute_fusion_params = [False]
if is_te_min_version("1.14.0"):
    permute_fusion_params.append(True)


class TestAllgatherDispatcher:
    def setup_method(self, method):
        pass
...
...
@@ -231,9 +235,8 @@ class TestAllgatherDispatcher:
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4), (1, 1)])
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_forward_backward(self, tp_size, ep_size):
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    def test_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
...
...
@@ -242,6 +245,7 @@ class TestAllgatherDispatcher:
            moe_router_topk=2,
            moe_router_load_balancing_type="aux_loss",
            moe_token_dispatcher_type="allgather",
            moe_permute_fusion=permute_fusion,
        )
        container.dispatcher_dropless_test()
...
...
@@ -249,12 +253,11 @@ class TestAllgatherDispatcher:
    @pytest.mark.internal
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    @pytest.mark.parametrize(
        "tp_size,ep_size,moe_tp_size", [(1, 1, 8), (1, 2, 4), (1, 4, 2), (2, 2, 4)]
    )
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_moe_tp_forward_backward(self, tp_size, ep_size, moe_tp_size):
    def test_moe_tp_forward_backward(self, tp_size, ep_size, moe_tp_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
...
...
@@ -266,7 +269,93 @@ class TestAllgatherDispatcher:
            moe_token_dispatcher_type="allgather",
            sequence_parallel=True,
            moe_grouped_gemm=True,
            moe_permute_fusion=permute_fusion,
            use_cpu_initialization=False,
        )
        container.dispatcher_dropless_test()


def is_deep_ep_available():
    from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP

    return HAVE_DEEP_EP


@pytest.mark.skipif(not is_deep_ep_available(), reason="Deep EP is not available")
class TestFlexDispatcher:
    def setup_method(self, method):
        pass

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    @pytest.mark.internal
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4)])
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    def test_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
            pp_size=1,
            num_moe_experts=8,
            moe_router_topk=2,
            moe_router_load_balancing_type="aux_loss",
            moe_token_dispatcher_type="flex",
            moe_permute_fusion=permute_fusion,
            hidden_size=4,
            moe_enable_deepep=True,
        )
        container.dispatcher_dropless_test()

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.timeout(120)
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
            pp_size=1,
            num_moe_experts=8,
            moe_router_topk=2,
            moe_router_load_balancing_type="aux_loss",
            moe_token_dispatcher_type="flex",
            moe_token_drop_policy="probs",
            moe_expert_capacity_factor=0.5,
            moe_pad_expert_input_to_capacity=False,
            moe_permute_fusion=permute_fusion,
            hidden_size=4,
            moe_enable_deepep=True,
        )
        container.dispatcher_capacity_test()

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    @pytest.mark.internal
    @pytest.mark.timeout(120)
    @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
    @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_capacity_padding_forward_backward(self, tp_size, ep_size, permute_fusion):
        container = MoEModelTestContainer(
            tp_size=tp_size,
            ep_size=ep_size,
            pp_size=1,
            num_moe_experts=8,
            moe_router_topk=2,
            moe_router_load_balancing_type="aux_loss",
            moe_token_dispatcher_type="flex",
            moe_token_drop_policy="probs",
            moe_expert_capacity_factor=0.6,
            moe_pad_expert_input_to_capacity=True,
            moe_permute_fusion=permute_fusion,
            hidden_size=4,
            moe_enable_deepep=True,
        )
        container.dispatcher_drop_and_pad_test()
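The dropless dispatcher tests above rest on a simple invariant: dispatching tokens to experts (permutation) followed by combining them back (un-permutation) must restore the original hidden states, up to the expert-tensor-parallel scale the test divides out. A minimal, framework-free sketch of that round trip using an index permutation and its inverse (an illustration of the invariant, not Megatron's dispatcher code):

import torch

num_tokens, hidden_size = 8, 4
hidden_states = torch.randn(num_tokens, hidden_size)

perm = torch.randperm(num_tokens)      # token -> expert-sorted position ("dispatch")
permuted = hidden_states[perm]

inverse_perm = torch.argsort(perm)     # position -> original token index ("combine")
restored = permuted[inverse_perm]

assert torch.allclose(restored, hidden_states)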
tests/unit_tests/transformer/test_attention.py View file @ 688448db
...
...
@@ -38,6 +38,8 @@ class TestParallelAttention:
        # we can't currently do this because the global memory buffer is on GPU
        pass

    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_gpu_forward(self):
        config = self.parallel_attention.config
...
...
@@ -62,6 +64,7 @@ class TestParallelAttention:
        assert output.shape[2] == config.hidden_size
        assert bias.shape[0] == config.hidden_size

    @pytest.mark.flaky_in_dev
    def test_fused_rope_gpu_forward(self):
        self.parallel_attention.config.apply_rope_fusion = True
        config = self.parallel_attention.config
...
...
@@ -91,6 +94,7 @@ class TestParallelAttention:
        assert bias.shape[0] == config.hidden_size
        self.parallel_attention.config.apply_rope_fusion = False

    @pytest.mark.flaky_in_dev
    def test_checkpointed_gpu_forward(self):
        transformer_config = self.transformer_config
        transformer_config.recompute_granularity = 'selective'
...
...
tests/unit_tests/transformer/test_multi_latent_attention.py View file @ 688448db
...
...
@@ -2,6 +2,7 @@
import os
from importlib.metadata import version
from inspect import signature

import pytest
import torch
...
...
@@ -9,16 +10,19 @@ import transformer_engine as te
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.attention import Attention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.multi_latent_attention import MLASelfAttention
from megatron.core.transformer.multi_latent_attention import MLASelfAttention, MultiLatentAttention
from megatron.core.transformer.transformer_config import MLATransformerConfig
from megatron.core.utils import is_te_min_version
from tests.unit_tests.test_utilities import Utils


@pytest.mark.parametrize("rope_type", ('yarn', 'rope'))
class TestParallelMLAAttention:

    def setup_method(self, method):
    @pytest.fixture(scope='function', autouse=True)
    def setup_and_teardown(self, rope_type):
        Utils.initialize_model_parallel(1, 1)
        model_parallel_cuda_manual_seed(123)
        self.transformer_config = MLATransformerConfig(
...
...
@@ -31,6 +35,7 @@ class TestParallelMLAAttention:
            qk_head_dim=128,
            v_head_dim=128,
            qk_pos_emb_head_dim=64,
            rope_type=rope_type,
            rotary_base=10000,
            max_position_embeddings=32,
        )
...
...
@@ -46,6 +51,19 @@ class TestParallelMLAAttention:
    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_input_params_forward(self):
        """
        Test to ensure that MultiLatentAttention has all parameters
        required by the Attention class's forward method.
        """
        # Extract parameters from the forward methods of both Attention and MultiLatentAttention
        attn_params = set(signature(Attention.forward).parameters.keys())
        mla_params = set(signature(MultiLatentAttention.forward).parameters.keys())

        # Identify parameters that are in Attention but missing in MultiLatentAttention
        missing_params = attn_params - mla_params
        assert not missing_params, f"Missing parameters in MultiLatentAttention: {missing_params}"

    def test_constructor(self):
        assert isinstance(self.parallel_attention, MLASelfAttention)
        assert self.parallel_attention.layer_number == 1
...
...
@@ -59,11 +77,6 @@ class TestParallelMLAAttention:
    def test_gpu_forward(self):
        if is_te_min_version("1.10.0"):
            # use flash attention for hopper, future may support fused attention for ampere
            os.environ['NVTE_FUSED_ATTN'] = "0"
            os.environ['NVTE_FLASH_ATTN'] = "1"
            config = self.parallel_attention.config
            sequence_length = 32
            micro_batch_size = 2
...
...
@@ -88,10 +101,6 @@ class TestParallelMLAAttention:
    def test_checkpointed_gpu_forward(self):
        if is_te_min_version("1.10.0"):
            # use flash attention for hopper, future may support fused attention for ampere
            os.environ['NVTE_FUSED_ATTN'] = "1"
            os.environ['NVTE_FLASH_ATTN'] = "0"
            transformer_config = self.transformer_config
            transformer_config.recompute_granularity = 'selective'
            checkpointed_parallel_attention = MLASelfAttention(
...
...
@@ -128,3 +137,117 @@ class TestParallelMLAAttention:
        assert output.shape[1] == micro_batch_size
        assert output.shape[2] == config.hidden_size
        assert bias.shape[0] == config.hidden_size


class TestSequenceParallelMLAAttention:
    def setup_method(self, method):
        self.tensor_parallel_size = 2
        Utils.initialize_model_parallel(self.tensor_parallel_size, 1)
        model_parallel_cuda_manual_seed(123)
        self.transformer_config = MLATransformerConfig(
            num_layers=2,
            hidden_size=12,
            num_attention_heads=4,
            q_lora_rank=32,
            kv_lora_rank=32,
            qk_head_dim=128,
            v_head_dim=128,
            qk_pos_emb_head_dim=64,
            rotary_base=10000,
            max_position_embeddings=64,
            tensor_model_parallel_size=self.tensor_parallel_size,
            sequence_parallel=True,
        )
        self.parallel_attention = MLASelfAttention(
            self.transformer_config,
            get_gpt_layer_with_transformer_engine_spec(
                multi_latent_attention=True
            ).submodules.self_attention.submodules,
            layer_number=1,
            attn_mask_type=AttnMaskType.causal,
        )

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_gpu_forward(self):
        if is_te_min_version("1.10.0"):
            config = self.parallel_attention.config
            sequence_length = 64
            sub_sequence_length = sequence_length // self.tensor_parallel_size
            micro_batch_size = 2

            self.parallel_attention.cuda()

            # [sequence length, batch size, hidden size]
            hidden_states = torch.ones(
                (sub_sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)
            )
            hidden_states = hidden_states.cuda()

            attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()

            output, bias = self.parallel_attention(hidden_states, attention_mask)

            assert config.recompute_granularity is None
            assert output.shape[0] == sub_sequence_length
            assert output.shape[1] == micro_batch_size
            assert output.shape[2] == config.hidden_size
            assert bias.shape[0] == config.hidden_size


class TestTensorParallelMLAAttention:
    def setup_method(self, method):
        self.tensor_parallel_size = 2
        Utils.initialize_model_parallel(self.tensor_parallel_size, 1)
        model_parallel_cuda_manual_seed(123)
        self.transformer_config = MLATransformerConfig(
            num_layers=2,
            hidden_size=12,
            num_attention_heads=4,
            q_lora_rank=32,
            kv_lora_rank=32,
            qk_head_dim=128,
            v_head_dim=128,
            qk_pos_emb_head_dim=64,
            rotary_base=10000,
            max_position_embeddings=64,
            tensor_model_parallel_size=self.tensor_parallel_size,
            sequence_parallel=False,
        )
        self.parallel_attention = MLASelfAttention(
            self.transformer_config,
            get_gpt_layer_with_transformer_engine_spec(
                multi_latent_attention=True
            ).submodules.self_attention.submodules,
            layer_number=1,
            attn_mask_type=AttnMaskType.causal,
        )

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_gpu_forward(self):
        if is_te_min_version("1.10.0"):
            config = self.parallel_attention.config
            sequence_length = 64
            micro_batch_size = 2

            self.parallel_attention.cuda()

            # [sequence length, batch size, hidden size]
            hidden_states = torch.ones(
                (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)
            )
            hidden_states = hidden_states.cuda()

            attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()

            output, bias = self.parallel_attention(hidden_states, attention_mask)

            assert config.recompute_granularity is None
            assert output.shape[0] == sequence_length
            assert output.shape[1] == micro_batch_size
            assert output.shape[2] == config.hidden_size
            assert bias.shape[0] == config.hidden_size