Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed
import copy
import os
import random
import string
import time
from collections import OrderedDict
from typing import Dict
from typing import Dict, List
from unittest import mock
import pytest
......@@ -30,7 +31,7 @@ from tests.unit_tests.test_utilities import Utils
class TestTextGenerationController:
def setup_method(self, method):
def setup_model(self, dtype):
Utils.initialize_model_parallel(
tensor_model_parallel_size=2, pipeline_model_parallel_size=2
)
......@@ -58,8 +59,10 @@ class TestTextGenerationController:
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=self.hidden_size,
inference_batch_times_seqlen_threshold=-1,
inference_max_seq_length=2048,
inference_max_requests=self.batch_size,
fp32_residual_connection=False,
params_dtype=torch.float,
params_dtype=dtype,
padded_vocab_size=self.vocab_size,
)
......@@ -75,6 +78,8 @@ class TestTextGenerationController:
Utils.destroy_model_parallel()
def test_sample_from_logits(self):
self.setup_model(torch.float32)
with pytest.raises(AssertionError) as aerror:
self.text_generation_controller.sample_from_logits(
last_token_logits=None,
......@@ -138,27 +143,98 @@ class TestTextGenerationController:
sampled_logits >= expected_min_value
), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}"
def test_generate_all_output_tokens_static_batch(self):
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
def test_generate_all_output_tokens_static_batch(self, dtype):
self.setup_model(dtype)
self.mock_tokenizer.vocab_size = self.vocab_size
self.mock_tokenizer.eod = self.vocab_size - 1
self.mock_tokenizer.detokenize.return_value = ''.join(
random.choices(string.ascii_letters, k=random.randint(4, 10))
self.mock_tokenizer.detokenize.side_effect = lambda x: ' '.join(
[
''.join(random.choices(string.ascii_letters, k=random.randint(4, 10)))
for _ in range(len(x))
]
)
self.mock_tokenizer.offsets.side_effect = lambda _, s: [
i for i, c in enumerate(s) if c == ' '
] + [len(s)]
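# The mocks above emulate a whitespace tokenizer: detokenize() maps each token id to a
# random word joined by spaces, and offsets() returns the end offset of each token's text.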
active_requests: Dict[int, InferenceRequest] = OrderedDict()
active_requests: Dict[str, InferenceRequest] = OrderedDict()
all_prompt_tokens: Dict[str, List[int]] = OrderedDict()
for i in range(self.batch_size):
prompt = "sample" * (i + 1)
self.mock_tokenizer.tokenize.return_value = torch.randn(
self.batch_size, self.vocab_size
).cuda()
prompt_tokens = torch.randint(
low=0, high=self.vocab_size - 1, size=(len(prompt),)
).tolist()
request_id = str(i)
inference_request = InferenceRequest(
request_id=request_id,
prompt=prompt,
inference_parameters=SamplingParams(
num_tokens_to_generate=10, return_log_probs=True, return_segments=True
),
arrival_time=time.time(),
prompt_tokens=prompt_tokens,
status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
)
active_requests[request_id] = inference_request
all_prompt_tokens[request_id] = copy.deepcopy(prompt_tokens)
requests = self.text_generation_controller.generate_all_output_tokens_static_batch(
active_requests
)
for request_id, request in requests.items():
assert (
request.status == Status.COMPLETED
), f"Status should be completed but its {request.status}"
assert request.generated_length > 0, f"Generated length should be greater than zero"
assert request.generated_text is not None, "Generated text should not be None"
assert (
all_prompt_tokens[request_id] == request.prompt_tokens
), "Prompt tokens should not have changed during generation"
assert len(request.segments) == len(request.prompt_log_probs) + len(
request.generated_log_probs
), "Segments should be returned for both prompt and generated tokens"
assert len(request.prompt) + len(request.generated_text) == len(
request.text
), "Output text should include prompts and generations"
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
def test_output_log_probs(self, dtype):
self.setup_model(dtype)
self.mock_tokenizer.vocab_size = self.vocab_size
self.mock_tokenizer.bos = 0
self.mock_tokenizer.eod = self.vocab_size - 1
self.mock_tokenizer.detokenize.side_effect = lambda x: ' '.join(
[
''.join(random.choices(string.ascii_letters, k=random.randint(4, 10)))
for _ in range(len(x))
]
)
self.mock_tokenizer.offsets.side_effect = lambda _, s: [
i for i, c in enumerate(s) if c == ' '
] + [len(s)]
prompt = ""
active_requests: Dict[int, InferenceRequest] = OrderedDict()
for i in range(self.batch_size):
self.mock_tokenizer.tokenize.return_value = torch.randn(
self.batch_size, self.vocab_size
).cuda()
inference_request = InferenceRequest(
request_id=i,
prompt=prompt,
inference_parameters=SamplingParams(num_tokens_to_generate=10),
inference_parameters=SamplingParams(
num_tokens_to_generate=1, return_log_probs=True
),
arrival_time=time.time(),
prompt_tokens=torch.randint(
low=0, high=self.vocab_size - 1, size=(len(prompt),)
).tolist(),
prompt_tokens=[self.mock_tokenizer.bos],
status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
)
active_requests[i] = inference_request
......@@ -173,3 +249,43 @@ class TestTextGenerationController:
), f"Status should be completed but its {request.status}"
assert request.generated_length > 0, f"Generated length should be greater than zero"
assert request.generated_text is not None, "Generated text should not be None"
assert len(request.generated_log_probs) == request.generated_length
def test_token_overflow(self):
self.setup_model(torch.float32)
self.mock_tokenizer.vocab_size = self.vocab_size
self.mock_tokenizer.bos = 0
self.mock_tokenizer.eod = self.vocab_size - 1
self.mock_tokenizer.detokenize.side_effect = lambda x: ' '.join(
[
''.join(random.choices(string.ascii_letters, k=random.randint(4, 10)))
for _ in range(len(x))
]
)
self.mock_tokenizer.offsets.side_effect = lambda _, s: [
i for i, c in enumerate(s) if c == ' '
] + [len(s)]
prompt = ""
active_requests: Dict[int, InferenceRequest] = OrderedDict()
for i in range(self.batch_size):
self.mock_tokenizer.tokenize.return_value = torch.randn(
self.batch_size, self.vocab_size
).cuda()
inference_request = InferenceRequest(
request_id=i,
prompt=prompt,
inference_parameters=SamplingParams(
num_tokens_to_generate=4096, return_log_probs=True
),
arrival_time=time.time(),
prompt_tokens=[self.mock_tokenizer.bos],
status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
)
active_requests[i] = inference_request
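# num_tokens_to_generate=4096 exceeds the inference_max_seq_length of 2048 configured in
# setup_model, so generation is expected to raise an AssertionError.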
with pytest.raises(AssertionError):
requests = self.text_generation_controller.generate_all_output_tokens_static_batch(
active_requests
)
import copy
import os
import random
import string
import time
from argparse import Namespace
from collections import OrderedDict
from typing import Dict
from unittest import mock
import pytest
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.inference_request import InferenceRequest, Status, VLMInferenceRequest
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import (
VLMInferenceWrapper,
)
from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
VLMTextGenerationController,
)
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.models.multimodal.llava_model import LLaVAModel
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.enums import AttnBackend
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.legacy.model import Float16Module
from tests.unit_tests.test_utilities import Utils
class TestVLMTextGenerationController:
@pytest.mark.internal # The model is under active development and its methods may change.
def setup_method(self, method):
Utils.initialize_model_parallel(1, 1)
model_parallel_cuda_manual_seed(123)
self.language_hidden_size = 64
self.language_num_attention_heads = 4
self.language_vocab_size = 8192
self.language_max_sequence_length = 4096
self.img_h = 336
self.img_w = 336
language_config = TransformerConfig(
num_layers=3,
hidden_size=self.language_hidden_size,
num_attention_heads=self.language_num_attention_heads,
use_cpu_initialization=False,
)
vision_config = TransformerConfig(
num_layers=2, hidden_size=16, num_attention_heads=2, use_cpu_initialization=False
)
vision_projection_config = TransformerConfig(
num_layers=2,
hidden_size=self.language_hidden_size,
ffn_hidden_size=32,
num_attention_heads=1,
use_cpu_initialization=False,
)
language_layer_spec = get_gpt_layer_local_spec()
vision_layer_spec = copy.deepcopy(language_layer_spec)
vision_projection_spec = copy.deepcopy(language_layer_spec.submodules.mlp.submodules)
language_config.language_model_type = "dummy"
vision_config.vision_model_type = "clip"
self.model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_layer_spec,
language_vocab_size=self.language_vocab_size,
language_max_sequence_length=self.language_max_sequence_length,
vision_transformer_config=vision_config,
vision_transformer_layer_spec=vision_layer_spec,
drop_vision_class_token=False,
vision_projection_config=vision_projection_config,
vision_projection_layer_spec=vision_projection_spec,
img_h=self.img_h,
img_w=self.img_w,
patch_dim=14,
).cuda()
self.image_token_index = self.model.image_token_index
self.model = Float16Module(self.model, Namespace(fp16=False, bf16=True))
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=self.language_hidden_size,
inference_batch_times_seqlen_threshold=-1,
fp32_residual_connection=False,
params_dtype=torch.float,
padded_vocab_size=self.language_vocab_size,
)
inference_wrapped_model = VLMInferenceWrapper(self.model, inference_wrapper_config)
self.mock_tokenizer = mock.Mock()
self.text_generation_controller = VLMTextGenerationController(
inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer
)
def teardown_method(self, method):
Utils.destroy_model_parallel()
def test_generate_all_output_tokens_static_batch(self):
self.mock_tokenizer.vocab_size = self.language_vocab_size
self.mock_tokenizer.eod = self.language_vocab_size - 1
self.mock_tokenizer.detokenize.return_value = ''.join(
random.choices(string.ascii_letters, k=random.randint(4, 10))
)
batch_size: int = 1
num_img_embeddings_per_tile: int = 576
imgs: torch.Tensor = torch.randn(1, 3, self.img_h, self.img_w).cuda()
num_tiles: torch.Tensor = torch.Tensor([1]).int()
decoder_seq_length: int = self.language_max_sequence_length
active_requests: Dict[str, InferenceRequest] = OrderedDict()
all_prompt_tokens: Dict[str, List[int]] = OrderedDict()
for i in range(batch_size):
prompt = "sample" * (i + 1)
self.mock_tokenizer.tokenize.return_value = torch.randn(
batch_size, self.language_vocab_size
).cuda()
prompt_tokens = torch.randint(
low=0, high=self.language_vocab_size - 1, size=(len(prompt),)
).tolist()
prompt_tokens[3] = self.image_token_index
request_id = str(i)
inference_request = VLMInferenceRequest(
request_id=request_id,
prompt=prompt,
inference_parameters=CommonInferenceParams(num_tokens_to_generate=10),
arrival_time=time.time(),
prompt_tokens=prompt_tokens,
num_img_embeddings_per_tile=num_img_embeddings_per_tile,
imgs=imgs,
num_tiles=num_tiles,
decoder_seq_length=decoder_seq_length,
status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS,
)
active_requests[request_id] = inference_request
all_prompt_tokens[request_id] = copy.deepcopy(prompt_tokens)
requests = self.text_generation_controller.generate_all_output_tokens_static_batch(
active_requests
)
for request_id, request in requests.items():
assert (
request.status == Status.COMPLETED
), f"Status should be completed but its {request.status}"
assert request.generated_length > 0, f"Generated length should be greater than zero"
assert request.generated_text is not None, "Generated text should not be None"
assert (
all_prompt_tokens[request_id] == request.prompt_tokens
), "Prompt tokens should not have changed during generation"
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import inspect
import os
import pytest
import torch
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.models.gpt.gpt_layer_specs import (
get_gpt_layer_with_transformer_engine_spec,
get_mlp_module_spec,
)
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
......@@ -59,7 +63,7 @@ class TestGPTModel:
@pytest.mark.internal
def test_post_process_forward(self):
config: TransformerConfig = self.gpt_model.config
_ = self.gpt_model.config
sequence_length = self.gpt_model.max_sequence_length
micro_batch_size = 2
......@@ -79,3 +83,36 @@ class TestGPTModel:
assert logits.shape[0] == micro_batch_size
assert logits.shape[1] == sequence_length
assert logits.shape[2] == self.gpt_model.vocab_size
def test_get_mlp_module_spec_interface():
# Get the function signature
sig = inspect.signature(get_mlp_module_spec)
# Define the expected signature
expected_params = {
"use_te": inspect.Parameter.POSITIONAL_OR_KEYWORD,
"num_experts": inspect.Parameter.POSITIONAL_OR_KEYWORD,
"moe_grouped_gemm": inspect.Parameter.POSITIONAL_OR_KEYWORD,
"fp8": inspect.Parameter.POSITIONAL_OR_KEYWORD,
"moe_use_legacy_grouped_gemm": inspect.Parameter.POSITIONAL_OR_KEYWORD,
}
expected_defaults = {
"use_te": True,
"num_experts": None,
"moe_grouped_gemm": False,
"fp8": None,
"moe_use_legacy_grouped_gemm": False,
}
# Check parameter kinds
for param_name, param in sig.parameters.items():
assert param_name in expected_params.keys(), f"Unexpected parameter: {param_name}"
assert param.kind is expected_params[param_name], f"Wrong kind for parameter: {param_name}"
# Check default values
defaults = {
k: v.default for k, v in sig.parameters.items() if v.default is not inspect.Parameter.empty
}
assert defaults == expected_defaults, "Default values do not match the expected ones."
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from contextlib import nullcontext
from copy import deepcopy
from types import SimpleNamespace
......@@ -8,6 +9,7 @@ import torch
from megatron.core import InferenceParams
from megatron.core import parallel_state as ps
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.models.multimodal import context_parallel
from megatron.core.models.multimodal.llava_model import LLaVAModel
from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec
from megatron.core.packed_seq_params import PackedSeqParams
......@@ -49,6 +51,7 @@ class TestLLaVAModel:
vision_layer_spec = deepcopy(language_layer_spec)
vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)
language_config.language_model_type = "dummy"
vision_config.vision_model_type = "clip"
self.model = LLaVAModel(
language_transformer_config=language_config,
......@@ -131,7 +134,6 @@ class TestLLaVAModel:
use_inference_kv_cache = False
inference_params = None
image_token_mask = None
embeddings, labels, loss_mask = self.model._preprocess_data(
image_embeddings,
......@@ -143,7 +145,6 @@ class TestLLaVAModel:
inference_params,
image_token_index,
num_image_tiles,
image_token_mask,
)
img_seq_len = 577
......@@ -320,21 +321,27 @@ class TestLLaVAModel:
# Try with labels and PackedSeqParams. Only micro batch size 1 is supported in this mode.
packed_seq_params = PackedSeqParams(
qkv_format="thd",
cu_seqlens_q=[0, 512, 1024, 1600], # Just example values.
cu_seqlens_kv=[0, 512, 1024, 1600],
max_seqlen_q=[1600],
max_seqlen_kv=[1600],
cu_seqlens_q=torch.tensor(
[0, 512, 1024, 1600], dtype=torch.int32
).cuda(), # Just example values.
cu_seqlens_kv=torch.tensor([0, 512, 1024, 1600], dtype=torch.int32).cuda(),
max_seqlen_q=torch.tensor(1600, dtype=torch.int32).cuda(),
max_seqlen_kv=torch.tensor(1600, dtype=torch.int32).cuda(),
)
# NOTE: Packing is only supported with BF16. Use BF16 here and switch back to default.
self.model.to(torch.bfloat16)
loss, new_loss_mask = self.model.forward(
img[:1],
img[:1].to(torch.bfloat16),
input_ids[:1],
position_ids[:1],
attention_mask,
labels[:1],
loss_mask[:1],
num_image_tiles=num_image_tiles[:1],
packed_seq_params=packed_seq_params,
)
self.model.to(torch.float32)
# 1600 = 577 (img_seq_len) + 1024 (text tokens in the first sample) - 1 (image token).
assert loss.shape == new_loss_mask.shape == torch.Size((1, 1600))
......@@ -391,6 +398,49 @@ class TestLLaVAModel:
== torch.Size((max_seq_len, 5, self.language_num_attention_heads, 16))
)
@pytest.mark.internal
def test_forward_fsdp(self):
"""Test FSDP workaround for text-only data.
FSDP can hang with text-only data. As a workaround, we run the vision model with a dummy image,
but then effectively discard the image embeddings.
"""
self.model.cuda()
# Dummy image for the FSDP workaround, but no image tiles.
img = torch.zeros((1, 3, 336, 336)).cuda()
num_image_tiles = torch.tensor([], dtype=torch.int).cuda()
# No image tag in the input ids (text-only sample).
image_token_index = self.model.image_token_index
input_ids = torch.arange(1024, device="cuda").unsqueeze(0)
assert (
torch.sum(input_ids == image_token_index) == 0
), "expected no image tag in the input ids"
position_ids = torch.arange(1024, device="cuda").unsqueeze(0)
loss_mask = torch.ones((1, 1024), device="cuda")
attention_mask = None # Causal.
labels = torch.arange(1, 1025, device="cuda").unsqueeze(0)
# Mock the FSDP attribute.
self.model.vision_model._is_fsdp_managed_module = True
loss, new_loss_mask = self.model.forward(
img,
input_ids,
position_ids,
attention_mask,
labels,
loss_mask,
num_image_tiles=num_image_tiles,
)
self.model.vision_model._is_fsdp_managed_module = False
assert loss.shape == new_loss_mask.shape == torch.Size((1, 1024))
@pytest.mark.internal
def test_save_load(self, tmp_path):
path = tmp_path / "model.pt"
......@@ -436,6 +486,7 @@ class TestLLaVAModelSigLIP:
vision_layer_spec = deepcopy(language_layer_spec)
vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)
language_config.language_model_type = "dummy"
vision_config.vision_model_type = "siglip"
self.model = LLaVAModel(
language_transformer_config=language_config,
......@@ -482,19 +533,18 @@ def create_test_args(cp_size, sequence_parallel):
class TestLLaVAModelTokenParallel:
def init_llava_model(self):
self.language_hidden_size = 64
self.language_num_attention_heads = 16
def _init_llava_model(self, cp_size, tp_size, sequence_parallel):
language_hidden_size = 64
language_num_attention_heads = 16
language_config = TransformerConfig(
num_layers=3,
hidden_size=self.language_hidden_size,
num_attention_heads=self.language_num_attention_heads,
hidden_size=language_hidden_size,
num_attention_heads=language_num_attention_heads,
use_cpu_initialization=False,
tensor_model_parallel_size=self.tp_size,
sequence_parallel=self.sequence_parallel,
context_parallel_size=1, # Init with CP=1 until CI catches up to TEv1.10
# context_parallel_size=self.cp_size,
tensor_model_parallel_size=tp_size,
sequence_parallel=sequence_parallel,
context_parallel_size=cp_size,
)
# SP and CP are not yet supported for the Vision Backbone
vision_config = TransformerConfig(
......@@ -502,17 +552,17 @@ class TestLLaVAModelTokenParallel:
hidden_size=16,
num_attention_heads=8,
use_cpu_initialization=False,
tensor_model_parallel_size=self.tp_size,
tensor_model_parallel_size=tp_size,
sequence_parallel=False,
context_parallel_size=1,
)
vision_projection_config = TransformerConfig(
num_layers=2,
hidden_size=self.language_hidden_size,
ffn_hidden_size=1024,
hidden_size=language_hidden_size,
ffn_hidden_size=128,
num_attention_heads=8,
use_cpu_initialization=False,
tensor_model_parallel_size=self.tp_size,
tensor_model_parallel_size=tp_size,
sequence_parallel=False,
context_parallel_size=1,
)
......@@ -537,8 +587,9 @@ class TestLLaVAModelTokenParallel:
vision_layer_spec = deepcopy(language_layer_spec)
vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)
language_config.language_model_type = "dummy"
vision_config.vision_model_type = "clip"
self.model = LLaVAModel(
model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_layer_spec,
language_vocab_size=8192,
......@@ -553,7 +604,9 @@ class TestLLaVAModelTokenParallel:
patch_dim=14,
)
@pytest.mark.internal # The model is under active development and its methods may change.
return model
@pytest.mark.internal
def setup_method(self, method):
Utils.destroy_model_parallel()
......@@ -563,31 +616,46 @@ class TestLLaVAModelTokenParallel:
@pytest.mark.internal
@pytest.mark.parametrize(
"cp_size,tp_size,sequence_parallel", [(1, 8, True), (2, 4, False), (2, 4, True)]
"cp_size,tp_size,sequence_parallel,padding",
[(1, 8, True, True), (2, 4, False, True), (2, 4, True, False), (2, 4, True, True)],
)
def test_process_embedding_token_parallel(self, cp_size, tp_size, sequence_parallel):
self.cp_size = cp_size
self.tp_size = tp_size
self.sequence_parallel = sequence_parallel
def test_process_embedding_token_parallel(self, cp_size, tp_size, sequence_parallel, padding):
"""Test _process_embedding_token_parallel.
Note: This test requires TE version >= 1.10.0 to run properly.
"""
Utils.initialize_model_parallel(
tensor_model_parallel_size=self.tp_size, context_parallel_size=self.cp_size
tensor_model_parallel_size=tp_size, context_parallel_size=cp_size
)
model_parallel_cuda_manual_seed(123)
self.init_llava_model()
self.model.cuda()
# Setting CP size for LLM here as model init is done with CP=1 to
# avoid TE version check until CI catches up to TEv1.10
if self.cp_size > 1:
self.model.context_parallel_lm = self.cp_size
# TE version must be at least 1.10.0 if using context parallelism. Exit otherwise.
ctx = (
nullcontext()
if (is_te_min_version("1.10.0") or cp_size <= 1)
else pytest.raises(AssertionError)
)
model = None
with ctx:
model = self._init_llava_model(cp_size, tp_size, sequence_parallel)
args = create_test_args(self.cp_size, self.sequence_parallel)
if model is None:
return
model.cuda()
args = create_test_args(cp_size, sequence_parallel)
set_args(args)
batch_size = 2
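# Padded lengths below are multiples of the token-parallel sharding factor:
# 2056 = 8 * 257 (TP=8 with SP) and 2064 = 16 * 129 (TP=4, CP=2 with SP).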
if padding:
combined_valid_seqlen = 2049
combined_padded_seqlen = 2056
if self.cp_size > 1:
combined_padded_seqlen = 2064
else:
combined_valid_seqlen = 2048
combined_padded_seqlen = 2048
if cp_size > 1:
combined_embeddings = torch.ones(
[batch_size, combined_padded_seqlen, 4096], device='cuda', dtype=torch.bfloat16
) # [B, S, H]
......@@ -617,6 +685,20 @@ class TestLLaVAModelTokenParallel:
device=combined_embeddings.device,
)
qkv_format = 'sbhd' # Default format when not using padding
if cp_size > 1 and padding:
# Reshape from [B,S] to [1,T]
combined_embeddings = (
combined_embeddings.contiguous()
.view(combined_embeddings.shape[0] * combined_embeddings.shape[1], -1)
.unsqueeze(0)
)
new_labels = new_labels.view(new_labels.shape[0] * new_labels.shape[1]).unsqueeze(0)
new_loss_mask = new_loss_mask.view(
new_loss_mask.shape[0] * new_loss_mask.shape[1]
).unsqueeze(0)
qkv_format = 'thd'
packed_seq_params = PackedSeqParams(
cu_seqlens_q=cu_seqlens,
cu_seqlens_kv=cu_seqlens,
......@@ -624,41 +706,43 @@ class TestLLaVAModelTokenParallel:
cu_seqlens_kv_padded=cu_seqlens_padded,
max_seqlen_q=combined_padded_seqlen,
max_seqlen_kv=combined_padded_seqlen,
qkv_format='thd',
qkv_format=qkv_format,
)
combined_embeddings, new_labels, new_loss_mask, packed_seq_params = (
self.model._process_embedding_token_parallel(
model._process_embedding_token_parallel(
combined_embeddings, new_labels, new_loss_mask, packed_seq_params
)
)
# Calculate the expected padded seq length
if self.cp_size > 1 and self.sequence_parallel:
padding_factor = self.tp_size * self.cp_size * 2
elif self.cp_size > 1:
padding_factor = self.cp_size * 2
elif self.sequence_parallel:
padding_factor = self.tp_size
padded_seq_len = int(
(combined_padded_seqlen + (padding_factor - 1)) // padding_factor * padding_factor
)
# Check if output shape is as expected
if self.cp_size > 1 and self.sequence_parallel:
if cp_size > 1 and sequence_parallel:
if padding:
# THD format
assert combined_embeddings.shape[0] == batch_size * (
padded_seq_len / (self.tp_size * self.cp_size)
combined_padded_seqlen / (tp_size * cp_size)
)
assert combined_embeddings.shape[1] == 1
elif self.cp_size > 1:
else:
# SBHD format
assert combined_embeddings.shape[0] == (
combined_padded_seqlen / (tp_size * cp_size)
)
assert combined_embeddings.shape[1] == batch_size
elif cp_size > 1:
if padding:
# THD format
assert combined_embeddings.shape[0] == batch_size * (padded_seq_len / self.cp_size)
assert combined_embeddings.shape[0] == batch_size * (
combined_padded_seqlen / cp_size
)
assert combined_embeddings.shape[1] == 1
else:
# SBHD format
assert combined_embeddings.shape[0] == padded_seq_len / self.tp_size
assert combined_embeddings.shape[0] == (combined_padded_seqlen / cp_size)
assert combined_embeddings.shape[1] == batch_size
else:
# SBHD format
assert combined_embeddings.shape[0] == combined_padded_seqlen / tp_size
assert combined_embeddings.shape[1] == batch_size
......@@ -690,7 +774,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
model_parallel_cuda_manual_seed(123)
language_config = TransformerConfig(
num_layers=8,
num_layers=12,
hidden_size=language_hidden_size,
num_attention_heads=language_num_attention_heads,
use_cpu_initialization=False,
......@@ -718,6 +802,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
vision_layer_spec = get_vit_layer_with_transformer_engine_spec()
vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules)
language_config.language_model_type = "dummy"
vision_config.vision_model_type = "clip"
non_parallel_model = LLaVAModel(
language_transformer_config=language_config,
......@@ -762,7 +847,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
add_decoder = False if (pp_rank == 0 and epp == 1) else True
language_config = TransformerConfig(
num_layers=8,
num_layers=12,
hidden_size=language_hidden_size,
num_attention_heads=language_num_attention_heads,
use_cpu_initialization=False,
......@@ -790,6 +875,7 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
vision_layer_spec = get_vit_layer_with_transformer_engine_spec()
vision_projection_spec = deepcopy(vision_layer_spec.submodules.mlp.submodules)
language_config.language_model_type = "dummy"
vision_config.vision_model_type = "clip"
model = LLaVAModel(
language_transformer_config=language_config,
......@@ -895,3 +981,39 @@ def test_llava_model_parallelism(dtp, dpp, etp, epp):
Utils.destroy_model_parallel()
torch.cuda.empty_cache()
@pytest.mark.internal
@pytest.mark.parametrize(
"cp_size, tp_size, has_sp, seq_len, expected_padding",
[(1, 1, False, 99, 0), (2, 2, True, 99, 5), (2, 2, False, 99, 1)],
)
def test_get_padding(cp_size, tp_size, has_sp, seq_len, expected_padding):
"""Test calculating padding for context parallel."""
padding = context_parallel.get_padding(seq_len, cp_size, tp_size, has_sp)
assert padding == expected_padding
@pytest.mark.internal
@pytest.mark.parametrize(
"tokens, img_seq_len, padding_needed, cp_size, expected_seq_len",
[(torch.ones((1, 100)), 100, 0, 2, 200), (torch.ones((1, 100)), 128, 1, 2, 227)],
)
def test_get_packed_seq_params(tokens, img_seq_len, padding_needed, cp_size, expected_seq_len):
"""Test creating PackedSeqParams for context parallel."""
packed_seq_params = context_parallel.get_packed_seq_params(
tokens, img_seq_len, padding_needed, cp_size
)
assert torch.equal(
packed_seq_params.cu_seqlens_q, torch.tensor([0, expected_seq_len], dtype=torch.int32)
)
if padding_needed > 0:
padded_seq_len = tokens.shape[1] + img_seq_len
assert torch.equal(
packed_seq_params.cu_seqlens_q_padded,
torch.tensor([0, padded_seq_len], dtype=torch.int32),
)
assert packed_seq_params.max_seqlen_q == padded_seq_len
......@@ -3,7 +3,7 @@
import pytest
import torch
from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
from megatron.core.models.gpt.gpt_layer_specs import get_mlp_module_spec
from megatron.core.models.vision.multimodal_projector import MultimodalProjector
from megatron.core.tensor_parallel.layers import ColumnParallelLinear
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
......@@ -20,7 +20,7 @@ class TestMultimodalProjector:
transformer_config = TransformerConfig(
num_layers=1, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True
)
mlp_layer_spec = _get_mlp_module_spec().submodules
mlp_layer_spec = get_mlp_module_spec().submodules
affine_layer_spec = MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=None)
self.mlp = MultimodalProjector(
......
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import pytest
import torch
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.models.vision.radio import RADIOViTModel
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from tests.unit_tests.test_utilities import Utils
class TestRADIOViTModel:
"""Test RADIO ViT model."""
def setup_method(self, method):
Utils.initialize_model_parallel(1, 1)
model_parallel_cuda_manual_seed(123)
transformer_config = TransformerConfig(
num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True
)
transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec()
self.model = RADIOViTModel(
transformer_config,
transformer_layer_spec,
img_h=224,
img_w=224,
patch_dim=14,
add_class_token=False,
)
def teardown_method(self, method):
Utils.destroy_model_parallel()
def test_constructor(self):
assert isinstance(self.model, RADIOViTModel)
num_weights = sum([p.numel() for p in self.model.parameters()])
assert num_weights == 1501824
def test_set_input_tensor(self):
# [s, b, h] expected to the transformer.
expected_shape = (256, 2, 64)
input_tensor = torch.zeros(expected_shape)
self.model.set_input_tensor(input_tensor)
assert self.model.decoder.input_tensor.shape == torch.Size(expected_shape)
def test_forward(self):
self.model.cuda()
img = torch.zeros((2, 3, 224, 224)).cuda()
out = self.model.forward(img)
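# 224 / 14 = 16 patches per side, so each image yields 16 * 16 = 256 tokens of hidden size 64.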
assert out.shape == torch.Size([2, 256, 64])
def test_save_load(self, tmp_path):
path = tmp_path / "model.pt"
torch.save(self.model.state_dict(), path)
self.model.load_state_dict(torch.load(path))
......@@ -107,6 +107,7 @@ class TestT5Model:
assert self.t5_model.encoder_hidden_state.shape[1] == micro_batch_size
assert self.t5_model.encoder_hidden_state.shape[2] == config.hidden_size
@pytest.mark.flaky_in_dev
def test_post_process_forward(self):
config: TransformerConfig = self.t5_model.config
sequence_length = self.t5_model.max_sequence_length
......@@ -156,6 +157,7 @@ class TestT5Model:
assert encoder_hidden_states.shape[1] == micro_batch_size
assert encoder_hidden_states.shape[2] == config.hidden_size
@pytest.mark.flaky_in_dev
def test_forward_output_encoder_hidden_only(self):
config: TransformerConfig = self.t5_model.config
sequence_length = self.t5_model.max_sequence_length
......@@ -191,6 +193,7 @@ class TestT5Model:
assert encoder_hidden_states.shape[1] == micro_batch_size
assert encoder_hidden_states.shape[2] == config.hidden_size
@pytest.mark.flaky_in_dev
def test_forward_with_encoder_hidden_states(self):
config: TransformerConfig = self.t5_model.config
sequence_length = self.t5_model.max_sequence_length
......
......@@ -269,3 +269,119 @@ def test_forward_backward_func_with_interleaving(mocker):
assert i['loss_reduced'] == j['loss_reduced']
Utils.destroy_model_parallel()
def test_forward_backward_func_with_uneven_interleaving(mocker):
from megatron.core.enums import ModelType
from megatron.core.pipeline_parallel import get_forward_backward_func
Utils.initialize_model_parallel(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=4,
virtual_pipeline_model_parallel_size=2,
)
def forward_step_func(data_iterator, model):
import os
rank = int(os.environ['LOCAL_RANK'])
def loss_func(output_tensor):
return rank, {'loss_reduced': rank}
return torch.rand(512, 8, 256).cuda(), loss_func
model_a = torch.nn.Linear(4, 1)
model_b = torch.nn.Linear(8, 1)
def set_input_tensor(input_tensor):
return None
model_a.set_input_tensor = set_input_tensor
model_b.set_input_tensor = set_input_tensor
forward_backward_func = get_forward_backward_func()
assert (
schedule.get_forward_backward_func()
== schedule.forward_backward_pipelining_with_interleaving
)
sequence_length = 512
micro_batch_size = 8
hidden_size = 256
config = ModelParallelConfig(
pipeline_model_parallel_size=4, sequence_parallel=False, pipeline_dtype=torch.float
)
config.hidden_size = hidden_size
model_a.config = config
model_b.config = config
mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2)
with pytest.raises(RuntimeError):
model_a.model_type = ModelType.encoder_and_decoder
model_b.model_type = ModelType.encoder_and_decoder
forward_backward_func(
forward_step_func=forward_step_func,
data_iterator=[range(0, 100)],
model=[model_a, model_b],
num_microbatches=micro_batch_size,
seq_length=sequence_length,
micro_batch_size=micro_batch_size,
decoder_seq_length=sequence_length,
forward_only=True,
)
with pytest.raises(RuntimeError):
model_a.model_type = ModelType.encoder_or_decoder
model_b.model_type = ModelType.encoder_or_decoder
forward_backward_func(
forward_step_func=forward_step_func,
data_iterator=[range(0, 100)],
model=[model_a, model_b],
num_microbatches=micro_batch_size,
seq_length=sequence_length,
micro_batch_size=micro_batch_size,
decoder_seq_length=256,
forward_only=True,
)
with pytest.raises(RuntimeError):
model_a.model_type = ModelType.encoder_or_decoder
model_b.model_type = ModelType.encoder_or_decoder
forward_backward_func(
forward_step_func=forward_step_func,
data_iterator=[range(0, 100)],
model=[model_a, model_b],
num_microbatches=7,
seq_length=sequence_length,
micro_batch_size=micro_batch_size,
decoder_seq_length=512,
forward_only=True,
)
model_a.model_type = ModelType.encoder_or_decoder
model_b.model_type = ModelType.encoder_or_decoder
losses_reduced = forward_backward_func(
forward_step_func=forward_step_func,
data_iterator=[range(0, 100), range(0, 100)],
model=[model_a, model_b],
num_microbatches=micro_batch_size,
seq_length=sequence_length,
micro_batch_size=micro_batch_size,
decoder_seq_length=sequence_length,
forward_only=True,
)
loss_reduced_expected = [
{'loss_reduced': rank},
{'loss_reduced': rank},
{'loss_reduced': rank},
{'loss_reduced': rank},
]
for i, j in zip(losses_reduced, loss_reduced_expected):
print(losses_reduced)
assert i['loss_reduced'] == j['loss_reduced']
Utils.destroy_model_parallel()
import pathlib
import pytest
import yaml
YAML_DIR = pathlib.Path(__file__).parent / ".." / "functional_tests" / "test_cases"
def get_yaml_files(directory):
"""Retrieve all YAML files from the specified directory."""
return list(directory.rglob("*.yaml"))
def load_yaml(file_path):
"""Load a YAML file and return its content as a Python dictionary."""
with open(file_path, "r") as f:
return yaml.safe_load(f)
@pytest.mark.parametrize(
"metric",
["--log-memory-to-tensorboard", "--log-num-zeros-in-grad", "--log-timers-to-tensorboard"],
)
@pytest.mark.parametrize("yaml_file", get_yaml_files(YAML_DIR))
def test_model_config_tracks_memory(yaml_file, metric):
"""Test if each YAML file contains the required record."""
print("gpt3-nemo" in str(yaml_file) or "ckpt_converter" in str(yaml_file))
if "gpt3-nemo" in str(yaml_file) or "ckpt_converter" in str(yaml_file):
pytest.skip("Skipping for gpt-nemo")
model_config = load_yaml(yaml_file)
assert (
"MODEL_ARGS" in model_config
and metric in model_config["MODEL_ARGS"]
and model_config["MODEL_ARGS"][metric] is True
), f"Please add argument `{metric}` to `{yaml_file.parent.name}/model_config.yaml` that its metric gets tracked."
import os
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD, Adam
from megatron.core.optimizer import ChainedOptimizer
from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
from megatron.core.optimizer import ChainedOptimizer, OptimizerConfig, get_megatron_optimizer
from megatron.core.transformer import TransformerConfig
from tests.unit_tests.test_utilities import Utils
from tests.unit_tests.test_utils import _deinit_distributed, _init_distributed
class Net(nn.Module):
......@@ -111,3 +118,45 @@ def test_precision_aware_fused_adam():
bytes_2 = p_2.data.view(torch.uint8)
# Make sure bit-wise matched
assert torch.all(bytes_1 == bytes_2)
@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
@pytest.mark.parametrize("precision", ['bf16', 'fp32'])
def test_optim_sharded_state_dict(use_distributed_optimizer: bool, precision: str):
world = int(os.getenv('WORLD_SIZE', '1'))
rank = int(os.getenv('RANK', '0'))
# Setup: distributed, model, mock_args.
_init_distributed(world, rank)
Utils.initialize_model_parallel()
model = torch.nn.Linear(100, 100, bias=False, dtype=torch.bfloat16, device='cuda')
model.requires_grad_(True)
model.weight.data.fill_(1.0)
ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=use_distributed_optimizer)
model = DistributedDataParallel(
TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
)
for param in model.parameters():
assert param.requires_grad
if precision == 'bf16':
optimizer_config = OptimizerConfig(
optimizer='adam', bf16=True, use_distributed_optimizer=use_distributed_optimizer
)
elif precision == 'fp32':
optimizer_config = OptimizerConfig(
optimizer='adam',
bf16=False,
fp16=False,
use_distributed_optimizer=use_distributed_optimizer,
)
optim = get_megatron_optimizer(optimizer_config, [model])
model_sharded_state_dict = model.sharded_state_dict()
sharded_state_dict = optim.sharded_state_dict(model_sharded_state_dict)
if 'optimizer' in sharded_state_dict and 'state' in sharded_state_dict['optimizer']:
assert (
'common_step' not in sharded_state_dict['optimizer']['state']
or sharded_state_dict['optimizer']['state']['common_step'] is not None
), "Found 'optimizer.state.common_step=None' in sharded state dict."
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import random
import numpy as np
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD, Adam
try:
from transformer_engine.pytorch.optimizers import FusedAdam as GPUAdam
from transformer_engine.pytorch.optimizers import FusedSGD as GPUSGD
except ImportError:
# Handle environment where transformer_engine is not installed
from torch.optim import SGD as GPUSGD
from torch.optim import Adam as GPUAdam
from megatron.core.optimizer.cpu_offloading import HybridDeviceOptimizer
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def setup_seed(seed):
random.seed(seed) # Set Python's built-in random seed
np.random.seed(seed) # Set NumPy's random seed
torch.manual_seed(seed) # Set PyTorch's CPU seed
torch.cuda.manual_seed(seed) # Set PyTorch's GPU seed (if using CUDA)
torch.cuda.manual_seed_all(seed) # Set seed for all GPUs
torch.backends.cudnn.deterministic = True # Ensure deterministic behavior
torch.backends.cudnn.benchmark = False # Disable auto-tuner for reproducibility
@pytest.mark.skipif(
torch.__version__ < '2.3.0',
reason=(
"Requires PyTorch 2.3.0 or higher, lower versions of pytorch have "
"misaligned optimizer accuracy for CPU and GPU."
),
)
@pytest.mark.parametrize('n_steps', [1, 10])
@pytest.mark.parametrize('overlap_cpu_optimizer_d2h_h2d', [False, True])
@pytest.mark.parametrize('offload_fraction', [0, 0.5, 1.0])
@pytest.mark.parametrize('optimizer', ['sgd', 'adam'])
@pytest.mark.parametrize('with_param_groups', [False, True])
def test_multi_device_hybrid_optimizer(
with_param_groups, optimizer, offload_fraction, overlap_cpu_optimizer_d2h_h2d, n_steps
):
setup_seed(42)
net1 = Net().cuda()
net2 = Net().cuda()
net2.load_state_dict(net1.state_dict())
base_lr = 1e-3
params = list(net1.parameters())
ref_params = list(net2.parameters())
if with_param_groups:
param_groups = [
{"params": params[: len(params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4},
{"params": params[len(params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4},
]
params = param_groups
ref_param_groups = [
{"params": ref_params[: len(ref_params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4},
{"params": ref_params[len(ref_params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4},
]
ref_params = ref_param_groups
if optimizer == 'adam':
cls_kwargs = dict(cpu_optimizer_cls=Adam, gpu_optimizer_cls=GPUAdam)
else:
cls_kwargs = dict(cpu_optimizer_cls=SGD, gpu_optimizer_cls=GPUSGD)
hdo = HybridDeviceOptimizer(
params,
offload_fraction=offload_fraction,
lr=base_lr,
overlap_cpu_optimizer_d2h_h2d=overlap_cpu_optimizer_d2h_h2d,
**cls_kwargs,
)
ref_optimizer = cls_kwargs['gpu_optimizer_cls'](ref_params, lr=base_lr)
# 1. run step on optimizer, make sure there is state generated
assert len(hdo.state_dict()["state"]) == 0 # state is empty
input = torch.randn(1, 3, 32, 32).cuda()
output = net1(input)
output.sum().backward()
hdo.step()
output = net2(input)
output.sum().backward()
ref_optimizer.step()
# PyTorch SGD will not generate state
if optimizer != 'sgd':
assert len(hdo.state_dict()["state"]) != 0
# 2. check the state is on right device
if optimizer == 'adam':
first_param_id = hdo.state_dict()["param_groups"][0]["params"][0]
last_param_id = hdo.state_dict()["param_groups"][-1]["params"][-1]
if offload_fraction > 0:
assert not hdo.state_dict()["state"][first_param_id]["exp_avg"].is_cuda
if offload_fraction < 1:
assert hdo.state_dict()["state"][last_param_id]["exp_avg"].is_cuda
# 3. check parameters allclose
for _ in range(1, n_steps):
input = torch.randn(1, 3, 32, 32).cuda()
output = net1(input)
output.sum().backward()
hdo.step()
output = net2(input)
output.sum().backward()
ref_optimizer.step()
params = net1.state_dict()
ref_params = net2.state_dict()
for k, v in params.items():
assert (v.isnan() == ref_params[k].isnan()).all()
torch.nan_to_num_(v, 0)
torch.nan_to_num_(ref_params[k], 0)
assert torch.allclose(
v, ref_params[k], atol=1e-03
), f"Weight {k} value mismatch, max error: {(v - ref_params[k]).abs().max()}"
......@@ -178,12 +178,17 @@ def test_encoder_tensor_pipeline_parallelism(order):
if rank < 2:
assert ps.get_tensor_model_parallel_world_size() == 3
assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], list)
last_ranks = ps.get_pipeline_model_parallel_last_rank()
assert isinstance(last_ranks, list)
assert len(last_ranks) == 2
elif rank == 2:
assert ps.get_tensor_model_parallel_world_size() == 3
assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], int)
assert isinstance(ps.get_pipeline_model_parallel_last_rank(), int)
else:
assert ps.get_tensor_model_parallel_world_size() == 5
assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], int)
assert isinstance(ps.get_pipeline_model_parallel_last_rank(), int)
Utils.destroy_model_parallel()
......@@ -255,6 +260,8 @@ def test_different_initialize_order_consistency(src_tp_pp, ep_size):
'src_tp_pp, ep_size',
[((1, 2), 1), ((1, 4), 1), ((2, 2), 1), ((1, 2), 2), ((1, 4), 2), ((2, 2), 2)],
)
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_different_initialize_order_unconsistency(src_tp_pp, ep_size):
Utils.initialize_model_parallel(
*src_tp_pp, expert_model_parallel_size=ep_size, order='tp-ep-dp-pp'
......
import os
import time
import urllib.request as req
from types import SimpleNamespace
import mock
import numpy as np
import pytest
import torch
import megatron.core.utils as util
import megatron.training.utils as training_util
from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
from megatron.core.transformer import TransformerConfig
from tests.unit_tests.test_utilities import Utils
......@@ -65,6 +71,7 @@ def _deinit_distributed():
torch.distributed.barrier()
@pytest.mark.flaky_in_dev
def test_check_param_hashes_across_dp_replicas():
world = int(os.getenv('WORLD_SIZE', '1'))
rank = int(os.getenv('RANK', '0'))
......@@ -72,7 +79,7 @@ def test_check_param_hashes_across_dp_replicas():
# Setup.
_init_distributed(world, rank)
Utils.initialize_model_parallel()
model = torch.nn.Linear(100, 100, bias=False)
model = torch.nn.Linear(100, 100, bias=False, device='cuda')
# First check case where all replicas agree.
model.weight.data.fill_(1.0)
......@@ -89,6 +96,7 @@ def test_check_param_hashes_across_dp_replicas():
_deinit_distributed()
@pytest.mark.flaky_in_dev
def test_cross_check_param_hashes_across_dp_replicas():
world = int(os.getenv('WORLD_SIZE', '1'))
rank = int(os.getenv('RANK', '0'))
......@@ -96,7 +104,7 @@ def test_cross_check_param_hashes_across_dp_replicas():
# Setup.
_init_distributed(world, rank)
Utils.initialize_model_parallel()
model = torch.nn.Linear(100, 100, bias=False)
model = torch.nn.Linear(100, 100, bias=False, device='cuda')
# First check case where all replicas agree.
model.weight.data.fill_(1.0)
......@@ -111,6 +119,57 @@ def test_cross_check_param_hashes_across_dp_replicas():
_deinit_distributed()
@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
@pytest.mark.flaky_in_dev
def test_param_norm(use_distributed_optimizer: bool):
world = int(os.getenv('WORLD_SIZE', '1'))
rank = int(os.getenv('RANK', '0'))
# Setup: distributed, model, mock_args.
_init_distributed(world, rank)
Utils.initialize_model_parallel()
model = torch.nn.Linear(100, 100, bias=False, dtype=torch.bfloat16, device='cuda')
model.requires_grad_(True)
model.weight.data.fill_(1.0)
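# A 100 x 100 weight filled with 1.0 has L2 norm sqrt(10000) = 100, which the norm checks
# below expect via pytest.approx(100.0).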
ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=use_distributed_optimizer)
# Use dummy TransformerConfig which doesn't trigger __post_init__ assertions.
model = DistributedDataParallel(
TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
)
for param in model.parameters():
assert param.requires_grad
mock_args = SimpleNamespace(bf16=True)
with mock.patch('megatron.training.utils.get_args', new=lambda: mock_args):
# Make sure norm is correct when `main_param` attribute is not available.
assert training_util.calc_params_l2_norm(
model, force_create_fp32_copy=False
) == pytest.approx(100.0)
assert training_util.calc_params_l2_norm(
model, force_create_fp32_copy=True
) == pytest.approx(100.0)
# Make sure norm is correct when `main_param` attribute is available.
optimizer_config = OptimizerConfig(
bf16=True, use_distributed_optimizer=use_distributed_optimizer
)
_ = get_megatron_optimizer(optimizer_config, [model])
for param in model.parameters():
assert hasattr(param, 'main_param')
if use_distributed_optimizer:
assert getattr(param, 'main_param_sharded', False)
assert training_util.calc_params_l2_norm(
model, force_create_fp32_copy=False
) == pytest.approx(100.0)
assert training_util.calc_params_l2_norm(
model, force_create_fp32_copy=True
) == pytest.approx(100.0)
# Teardown.
_deinit_distributed()
@pytest.mark.flaky_in_dev
def test_straggler_detector():
world = int(os.getenv('WORLD_SIZE', '1'))
rank = int(os.getenv('RANK', '0'))
......
......@@ -4,7 +4,10 @@ import pytest
import torch
from tests.unit_tests.test_utilities import Utils
from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer
from tests.unit_tests.transformer.moe.test_token_dispatcher import (
MoEModelTestContainer,
permute_fusion_params,
)
def test_placeholder():
......@@ -12,7 +15,6 @@ def test_placeholder():
pass
@pytest.mark.flaky
class TestAlltoAllDispatcher:
def setup_method(self, method):
pass
......@@ -24,9 +26,8 @@ class TestAlltoAllDispatcher:
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_forward_backward(self, tp_size, ep_size):
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
def test_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
......@@ -35,6 +36,7 @@ class TestAlltoAllDispatcher:
moe_router_topk=2,
moe_router_load_balancing_type="aux_loss",
moe_token_dispatcher_type="alltoall",
moe_permute_fusion=permute_fusion,
)
container.dispatcher_dropless_test()
......@@ -42,8 +44,6 @@ class TestAlltoAllDispatcher:
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_a2aseq_forward_backward(self, tp_size, ep_size):
container = MoEModelTestContainer(
tp_size=tp_size,
......@@ -53,6 +53,7 @@ class TestAlltoAllDispatcher:
moe_router_topk=2,
moe_router_load_balancing_type="aux_loss",
moe_token_dispatcher_type="alltoall_seq",
moe_permute_fusion=False,
)
container.dispatcher_dropless_test()
......@@ -60,9 +61,8 @@ class TestAlltoAllDispatcher:
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_capacity_forward_backward(self, tp_size, ep_size):
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
......@@ -74,6 +74,7 @@ class TestAlltoAllDispatcher:
moe_token_drop_policy="probs",
moe_expert_capacity_factor=0.5,
moe_pad_expert_input_to_capacity=False,
moe_permute_fusion=permute_fusion,
)
container.dispatcher_capacity_test()
......@@ -81,9 +82,8 @@ class TestAlltoAllDispatcher:
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_capacity_padding_forward_backward(self, tp_size, ep_size):
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
def test_capacity_padding_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
......@@ -95,5 +95,6 @@ class TestAlltoAllDispatcher:
moe_token_drop_policy="probs",
moe_expert_capacity_factor=0.6,
moe_pad_expert_input_to_capacity=True,
moe_permute_fusion=permute_fusion,
)
container.dispatcher_drop_and_pad_test()
......@@ -12,7 +12,7 @@ from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestC
class AuxlossTestContainer(MoEModelTestContainer):
def partition_input(self, input):
partitioned_input = input.chunk(
parallel_state.get_tensor_and_context_parallel_world_size(), dim=1
parallel_state.get_tensor_and_context_parallel_world_size(), dim=0
)[parallel_state.get_tensor_and_context_parallel_rank()]
output = partitioned_input.clone().detach()
output.requires_grad = True
......@@ -126,7 +126,9 @@ class TestSeqAuxLoss:
@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize("tp_size,ep_size,cp_size", [(1, 8, 1)])
@pytest.mark.parametrize(
"tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]
)
def test_a2a_dispatcher(self, tp_size, ep_size, cp_size):
container = AuxlossTestContainer(
tp_size=tp_size,
......
......@@ -80,7 +80,7 @@ class TestMoELayerInit:
)
Utils.destroy_model_parallel()
@pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"])
@pytest.mark.parametrize("moe_token_dispatcher_type", ["alltoall", "allgather"])
@pytest.mark.parametrize("grouped_gemm", [True, False])
@pytest.mark.parametrize("tp_size,ep_size", [(1, 1), (2, 2)])
def test_moe_with_late_initialize(
......
......@@ -5,6 +5,7 @@ import torch
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.moe_utils import get_updated_expert_bias
from megatron.core.transformer.moe.router import Router
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training.initialize import _set_random_seed
......@@ -47,12 +48,13 @@ class TestTop2Router:
@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize("moe_router_pre_softmax", [(True), (False)])
def test_router_forward(self, moe_router_pre_softmax):
@pytest.mark.parametrize("score_function", ["sigmoid", "softmax"])
def test_router_forward(self, moe_router_pre_softmax, score_function):
with torch.no_grad():
self.router = self.router.cuda()
self.router.config.moe_router_pre_softmax = moe_router_pre_softmax
self.router.config.moe_router_score_function = score_function
# [num tokens, hidden size]
hidden_states = torch.randn((32, 2, self.router.config.hidden_size))
hidden_states = hidden_states.cuda()
......@@ -60,7 +62,6 @@ class TestTop2Router:
@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
def test_aux_loss(self):
self.sequential_mlp = self.sequential_mlp.cuda()
......@@ -86,60 +87,149 @@ class TestTop2Router:
assert self.sequential_mlp.router.weight.grad.abs().sum() > 0
class TestDeviceLimitedTop2Router:
class TestGroupLimitedRouter:
def setup_method(self, method):
Utils.initialize_model_parallel(1, 1, expert_model_parallel_size=8)
Utils.initialize_model_parallel(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
expert_model_parallel_size=8,
context_parallel_size=1,
)
_set_random_seed(seed_=123, data_parallel_random_init=False)
print("done intializing")
num_moe_experts = 8
num_moe_experts = 16
self.transformer_config = TransformerConfig(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
expert_model_parallel_size=8,
context_parallel_size=1,
num_moe_experts=num_moe_experts,
moe_router_topk=4,
moe_router_group_topk=2,
moe_router_num_groups=8,
moe_router_pre_softmax=True,
moe_router_load_balancing_type="aux_loss",
moe_aux_loss_coeff=0,
moe_token_dispatcher_type="alltoall",
num_layers=2,
hidden_size=12,
num_attention_heads=4,
num_moe_experts=num_moe_experts,
use_cpu_initialization=True,
expert_model_parallel_size=8,
moe_router_load_balancing_type="aux_loss",
moe_router_topk_limited_devices=2,
moe_router_pre_softmax=True,
moe_router_topk=2,
moe_aux_loss_coeff=0,
)
# init MoE layer
transformer_layer_spec = get_gpt_layer_local_spec(
num_experts=num_moe_experts, moe_grouped_gemm=False
)
self.sequential_mlp = MoELayer(
self.moe_layer = MoELayer(
self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
)
self.router = self.sequential_mlp.router
).cuda()
self.router = self.moe_layer.router
def teardown_method(self, method):
Utils.destroy_model_parallel()
@pytest.mark.internal
def test_constructor(self):
assert isinstance(self.router, Router)
num_weights = sum([p.numel() for p in self.router.parameters()])
assert num_weights == 12 * 8, num_weights
assert (
num_weights
== self.transformer_config.hidden_size * self.transformer_config.num_moe_experts
), num_weights
@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("moe_router_group_topk,moe_router_num_groups", [(3, 8), (2, 4)])
@pytest.mark.parametrize("moe_router_pre_softmax", [(True), (False)])
def test_router_forward(self, moe_router_pre_softmax):
@pytest.mark.parametrize("score_function", ["sigmoid", "softmax"])
def test_router_forward(
self, moe_router_group_topk, moe_router_num_groups, moe_router_pre_softmax, score_function
):
with torch.no_grad():
self.router = self.router.cuda()
self.router.config.moe_router_group_topk = moe_router_group_topk
self.router.config.moe_router_num_groups = moe_router_num_groups
self.router.config.moe_router_pre_softmax = moe_router_pre_softmax
self.router.config.moe_router_score_function = score_function
if moe_router_pre_softmax:
self.router.config.moe_router_topk_scaling_factor = 16.0
# [num tokens, hidden size]
seq_len = 2
batch_size = 2
num_tokens = seq_len * batch_size
# hidden_states shape: [seq_len, batch_size, hidden_size]
hidden_states = torch.randn(
(seq_len, batch_size, self.router.config.hidden_size)
).cuda()
scores, routing_map = self.router(hidden_states)
assert scores.shape == (num_tokens, self.router.config.num_moe_experts), scores.shape
assert routing_map.shape == (
num_tokens,
self.router.config.num_moe_experts,
), routing_map.shape
group_routing_map = (
routing_map.reshape(num_tokens, moe_router_num_groups, -1).max(dim=-1).values
)
assert torch.all(group_routing_map.sum(dim=-1) <= moe_router_group_topk)
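For reference, a minimal PyTorch sketch of the group-limited selection this assertion checks: experts are partitioned into groups, only the best `group_topk` groups are kept per token, and the final top-k is taken inside those groups. The function name and the max-logit group scoring below are illustrative, not Megatron's implementation.

import torch

def group_limited_topk(logits, num_groups, group_topk, topk):
    # Illustrative only; not Megatron's implementation.
    num_tokens, num_experts = logits.shape
    # Score each group by its strongest expert and keep the best `group_topk` groups.
    group_scores = logits.view(num_tokens, num_groups, -1).max(dim=-1).values
    top_groups = group_scores.topk(group_topk, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, top_groups, 1.0)
    # Mask out experts outside the selected groups before the per-token top-k.
    expert_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_tokens, num_groups, num_experts // num_groups)
        .reshape(num_tokens, num_experts)
    )
    masked_logits = logits.masked_fill(expert_mask == 0, float("-inf"))
    top_experts = masked_logits.topk(topk, dim=-1).indices
    # Each token now activates experts from at most `group_topk` groups.
    return torch.zeros_like(logits, dtype=torch.bool).scatter_(1, top_experts, True)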
class TestAuxLossFreeTop2Router:
def setup_method(self, method):
Utils.initialize_model_parallel(1, 1, expert_model_parallel_size=8)
_set_random_seed(seed_=123, data_parallel_random_init=False)
print("done intializing")
num_moe_experts = 8
self.transformer_config = TransformerConfig(
num_layers=2,
hidden_size=12,
num_attention_heads=4,
num_moe_experts=num_moe_experts,
use_cpu_initialization=True,
expert_model_parallel_size=8,
moe_router_load_balancing_type="none", # No aux loss
moe_router_score_function="sigmoid", # Using sigmoid scoring
moe_router_enable_expert_bias=True, # Enable expert bias
moe_router_bias_update_rate=0.1, # Set bias update rate
moe_router_topk=2,
)
transformer_layer_spec = get_gpt_layer_local_spec(
num_experts=num_moe_experts, moe_grouped_gemm=False
)
self.moe_layer = MoELayer(
self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
)
self.router = self.moe_layer.router
assert self.router.expert_bias is not None
assert self.router.local_tokens_per_expert is not None
def teardown_method(self, method):
Utils.destroy_model_parallel()
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_router_forward_aux_free(self):
hidden_states = torch.randn((32, 2, self.router.config.hidden_size))
hidden_states = hidden_states.cuda()
scores, indices = self.router(hidden_states)
print(scores.shape, indices.shape)
assert scores.shape == (64, 8)
assert indices.shape == (64, 8)
print(
(indices == 0).sum(),
(indices == 1).sum(),
(indices == 2).sum(),
(indices == 3).sum(),
self.router = self.router.cuda()
# First forward pass
initial_bias = self.router.expert_bias.clone()
scores1, indices1 = self.router(hidden_states)
initial_tokens = self.router.local_tokens_per_expert.clone()
updated_bias = get_updated_expert_bias(
self.router.local_tokens_per_expert,
self.router.expert_bias,
self.router.config.moe_router_bias_update_rate,
)
# Verify expert bias was updated
assert not torch.equal(initial_bias, updated_bias), "Expert bias should be updated"
# Basic output checks
assert scores1.shape == (64, 8), "Router scores shape mismatch"
assert indices1.shape == (64, 8), "Router indices shape mismatch"
# Print some debug info
print("Updated bias after first forward pass:", updated_bias)
......@@ -8,8 +8,8 @@ import torch
from megatron.core import parallel_state
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.moe_utils import permute, unpermute
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import is_te_min_version
from megatron.training.initialize import _set_random_seed
from tests.unit_tests.test_utilities import Utils
......@@ -69,6 +69,8 @@ class MoEModelTestContainer:
use_cpu_initialization=kwargs.get("use_cpu_initialization", True),
sequence_parallel=tp_size > 1,
add_bias_linear=kwargs.get("add_bias_linear", False),
moe_permute_fusion=kwargs.get("moe_permute_fusion", False),
moe_enable_deepep=kwargs.get("moe_enable_deepep", False),
)
# init moe layer
......@@ -94,31 +96,30 @@ class MoEModelTestContainer:
moe_layer = self.moe_layer
bs = 32
seql = 8
# TODO: Find out why setting a manual seed can cause the test to fail
# Manual seed to differentiate input data for each rank
# rank = torch.distributed.get_rank()
# torch.manual_seed(1000 + rank)
hidden_states = torch.randn((bs, seql, moe_layer.config.hidden_size))
hidden_states = hidden_states.cuda()
ans = hidden_states / 2
# Permuting and then unpermuting the data should restore the original data
ans = hidden_states
hidden_states.requires_grad = True
probs, indices = moe_layer.router(hidden_states)
probs = torch.ones_like(probs) / moe_layer.router.topk / 2
## Uncomment these lines to help locate bugs.
# hidden_states = torch.ones_like(hidden_states) * torch.distributed.get_rank()
# hidden_states.requires_grad = True
# indices = torch.ones_like(indices) * torch.distributed.get_rank()
# print(permuted_local_hidden_states)
probs = torch.ones_like(probs) / moe_layer.router.topk
(permuted_local_hidden_states, tokens_per_expert) = (
moe_layer.token_dispatcher.token_permutation(hidden_states, probs, indices)
)
scale = moe_layer.config.expert_tensor_parallel_size
permuted_local_hidden_states /= scale
restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation(
permuted_local_hidden_states
)
# Reducing across TP ranks is equivalent to multiplying the data by the expert tensor parallel (ETP) size
scale = moe_layer.config.expert_tensor_parallel_size
restored_hidden_states = restored_hidden_states / scale
assert torch.allclose(
restored_hidden_states, ans
), "Restored hidden states do not match original hidden states"
......@@ -147,8 +148,6 @@ class MoEModelTestContainer:
moe_layer.token_dispatcher.token_permutation(hidden_states, probs, indices)
)
print(f"Dispatched tokens per expert: {tokens_per_expert}")
permuted_local_hidden_states /= moe_layer.config.tensor_model_parallel_size
restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation(
......@@ -220,6 +219,11 @@ class MoEModelTestContainer:
Utils.destroy_model_parallel()
permute_fusion_params = [False]
if is_te_min_version("1.14.0"):
permute_fusion_params.append(True)
class TestAllgatherDispatcher:
def setup_method(self, method):
pass
......@@ -231,9 +235,8 @@ class TestAllgatherDispatcher:
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4), (1, 1)])
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_forward_backward(self, tp_size, ep_size):
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
def test_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
......@@ -242,6 +245,7 @@ class TestAllgatherDispatcher:
moe_router_topk=2,
moe_router_load_balancing_type="aux_loss",
moe_token_dispatcher_type="allgather",
moe_permute_fusion=permute_fusion,
)
container.dispatcher_dropless_test()
......@@ -249,12 +253,11 @@ class TestAllgatherDispatcher:
@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
@pytest.mark.parametrize(
"tp_size,ep_size,moe_tp_size", [(1, 1, 8), (1, 2, 4), (1, 4, 2), (2, 2, 4)]
)
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_moe_tp_forward_backward(self, tp_size, ep_size, moe_tp_size):
def test_moe_tp_forward_backward(self, tp_size, ep_size, moe_tp_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
......@@ -266,7 +269,93 @@ class TestAllgatherDispatcher:
moe_token_dispatcher_type="allgather",
sequence_parallel=True,
moe_grouped_gemm=True,
moe_permute_fusion=permute_fusion,
use_cpu_initialization=False,
)
container.dispatcher_dropless_test()
def is_deep_ep_available():
from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP
return HAVE_DEEP_EP
@pytest.mark.skipif(not is_deep_ep_available(), reason="Deep EP is not available")
class TestFlexDispatcher:
def setup_method(self, method):
pass
def teardown_method(self, method):
Utils.destroy_model_parallel()
@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4)])
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
def test_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
pp_size=1,
num_moe_experts=8,
moe_router_topk=2,
moe_router_load_balancing_type="aux_loss",
moe_token_dispatcher_type="flex",
moe_permute_fusion=permute_fusion,
hidden_size=4,
moe_enable_deepep=True,
)
container.dispatcher_dropless_test()
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
pp_size=1,
num_moe_experts=8,
moe_router_topk=2,
moe_router_load_balancing_type="aux_loss",
moe_token_dispatcher_type="flex",
moe_token_drop_policy="probs",
moe_expert_capacity_factor=0.5,
moe_pad_expert_input_to_capacity=False,
moe_permute_fusion=permute_fusion,
hidden_size=4,
moe_enable_deepep=True,
)
container.dispatcher_capacity_test()
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
@pytest.mark.parametrize("permute_fusion", permute_fusion_params)
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_capacity_padding_forward_backward(self, tp_size, ep_size, permute_fusion):
container = MoEModelTestContainer(
tp_size=tp_size,
ep_size=ep_size,
pp_size=1,
num_moe_experts=8,
moe_router_topk=2,
moe_router_load_balancing_type="aux_loss",
moe_token_dispatcher_type="flex",
moe_token_drop_policy="probs",
moe_expert_capacity_factor=0.6,
moe_pad_expert_input_to_capacity=True,
moe_permute_fusion=permute_fusion,
hidden_size=4,
moe_enable_deepep=True,
)
container.dispatcher_drop_and_pad_test()
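The capacity flags above follow the usual token-dropping convention; a small illustrative calculation (the dispatcher's internal formula may differ) shows the bound that moe_expert_capacity_factor places on each expert.

import math

def expert_capacity(num_tokens, num_experts, topk, capacity_factor):
    # Illustrative upper bound on tokens an expert accepts; extras are dropped, or
    # inputs are padded up to this bound when moe_pad_expert_input_to_capacity=True.
    return math.ceil(capacity_factor * num_tokens * topk / num_experts)

assert expert_capacity(num_tokens=256, num_experts=8, topk=2, capacity_factor=0.6) == 39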
......@@ -38,6 +38,8 @@ class TestParallelAttention:
# we can't currently do this because the global memory buffer is on GPU
pass
@pytest.mark.flaky
@pytest.mark.flaky_in_dev
def test_gpu_forward(self):
config = self.parallel_attention.config
......@@ -62,6 +64,7 @@ class TestParallelAttention:
assert output.shape[2] == config.hidden_size
assert bias.shape[0] == config.hidden_size
@pytest.mark.flaky_in_dev
def test_fused_rope_gpu_forward(self):
self.parallel_attention.config.apply_rope_fusion = True
config = self.parallel_attention.config
......@@ -91,6 +94,7 @@ class TestParallelAttention:
assert bias.shape[0] == config.hidden_size
self.parallel_attention.config.apply_rope_fusion = False
@pytest.mark.flaky_in_dev
def test_checkpointed_gpu_forward(self):
transformer_config = self.transformer_config
transformer_config.recompute_granularity = 'selective'
......
......@@ -2,6 +2,7 @@
import os
from importlib.metadata import version
from inspect import signature
import pytest
import torch
......@@ -9,16 +10,19 @@ import transformer_engine as te
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.attention import Attention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.multi_latent_attention import MLASelfAttention
from megatron.core.transformer.multi_latent_attention import MLASelfAttention, MultiLatentAttention
from megatron.core.transformer.transformer_config import MLATransformerConfig
from megatron.core.utils import is_te_min_version
from tests.unit_tests.test_utilities import Utils
@pytest.mark.parametrize("rope_type", ('yarn', 'rope'))
class TestParallelMLAAttention:
def setup_method(self, method):
@pytest.fixture(scope='function', autouse=True)
def setup_and_teardown(self, rope_type):
Utils.initialize_model_parallel(1, 1)
model_parallel_cuda_manual_seed(123)
self.transformer_config = MLATransformerConfig(
......@@ -31,6 +35,7 @@ class TestParallelMLAAttention:
qk_head_dim=128,
v_head_dim=128,
qk_pos_emb_head_dim=64,
rope_type=rope_type,
rotary_base=10000,
max_position_embeddings=32,
)
......@@ -46,6 +51,19 @@ class TestParallelMLAAttention:
def teardown_method(self, method):
Utils.destroy_model_parallel()
def test_input_params_forward(self):
"""
Test to ensure that MultiLatentAttention has all parameters
required by the Attention class's forward method.
"""
# Extract parameters from the forward methods of both Attention and MultiLatentAttention
attn_params = set(signature(Attention.forward).parameters.keys())
mla_params = set(signature(MultiLatentAttention.forward).parameters.keys())
# Identify parameters that are in Attention but missing in MultiLatentAttention
missing_params = attn_params - mla_params
assert not missing_params, f"Missing parameters in MultiLatentAttention: {missing_params}"
def test_constructor(self):
assert isinstance(self.parallel_attention, MLASelfAttention)
assert self.parallel_attention.layer_number == 1
......@@ -59,11 +77,6 @@ class TestParallelMLAAttention:
def test_gpu_forward(self):
if is_te_min_version("1.10.0"):
# Use flash attention for Hopper; fused attention for Ampere may be supported in the future
os.environ['NVTE_FUSED_ATTN'] = "0"
os.environ['NVTE_FLASH_ATTN'] = "1"
config = self.parallel_attention.config
sequence_length = 32
micro_batch_size = 2
......@@ -88,10 +101,6 @@ class TestParallelMLAAttention:
def test_checkpointed_gpu_forward(self):
if is_te_min_version("1.10.0"):
# Use flash attention for Hopper; fused attention for Ampere may be supported in the future
os.environ['NVTE_FUSED_ATTN'] = "1"
os.environ['NVTE_FLASH_ATTN'] = "0"
transformer_config = self.transformer_config
transformer_config.recompute_granularity = 'selective'
checkpointed_parallel_attention = MLASelfAttention(
......@@ -128,3 +137,117 @@ class TestParallelMLAAttention:
assert output.shape[1] == micro_batch_size
assert output.shape[2] == config.hidden_size
assert bias.shape[0] == config.hidden_size
class TestSequenceParallelMLAAttention:
def setup_method(self, method):
self.tensor_parallel_size = 2
Utils.initialize_model_parallel(self.tensor_parallel_size, 1)
model_parallel_cuda_manual_seed(123)
self.transformer_config = MLATransformerConfig(
num_layers=2,
hidden_size=12,
num_attention_heads=4,
q_lora_rank=32,
kv_lora_rank=32,
qk_head_dim=128,
v_head_dim=128,
qk_pos_emb_head_dim=64,
rotary_base=10000,
max_position_embeddings=64,
tensor_model_parallel_size=self.tensor_parallel_size,
sequence_parallel=True,
)
self.parallel_attention = MLASelfAttention(
self.transformer_config,
get_gpt_layer_with_transformer_engine_spec(
multi_latent_attention=True
).submodules.self_attention.submodules,
layer_number=1,
attn_mask_type=AttnMaskType.causal,
)
def teardown_method(self, method):
Utils.destroy_model_parallel()
def test_gpu_forward(self):
if is_te_min_version("1.10.0"):
config = self.parallel_attention.config
sequence_length = 64
sub_sequence_length = sequence_length // self.tensor_parallel_size
micro_batch_size = 2
self.parallel_attention.cuda()
# [sequence length, batch size, hidden size]
hidden_states = torch.ones(
(sub_sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)
)
hidden_states = hidden_states.cuda()
attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
output, bias = self.parallel_attention(hidden_states, attention_mask)
assert config.recompute_granularity is None
assert output.shape[0] == sub_sequence_length
assert output.shape[1] == micro_batch_size
assert output.shape[2] == config.hidden_size
assert bias.shape[0] == config.hidden_size
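The sub_sequence_length arithmetic above mirrors how sequence parallelism shards the input: each tensor-parallel rank holds a contiguous slice of the sequence dimension. A minimal sketch with the shapes from this test (the chunking itself is illustrative; Megatron's scatter across ranks may differ in detail):

import torch

sequence_length, micro_batch_size, hidden_size, tp_size = 64, 2, 12, 2
full_input = torch.ones(sequence_length, micro_batch_size, hidden_size)
# One [32, 2, 12] shard per tensor-parallel rank along the sequence dimension.
shards = full_input.chunk(tp_size, dim=0)
assert all(shard.shape[0] == sequence_length // tp_size for shard in shards)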
class TestTensorParallelMLAAttention:
def setup_method(self, method):
self.tensor_parallel_size = 2
Utils.initialize_model_parallel(self.tensor_parallel_size, 1)
model_parallel_cuda_manual_seed(123)
self.transformer_config = MLATransformerConfig(
num_layers=2,
hidden_size=12,
num_attention_heads=4,
q_lora_rank=32,
kv_lora_rank=32,
qk_head_dim=128,
v_head_dim=128,
qk_pos_emb_head_dim=64,
rotary_base=10000,
max_position_embeddings=64,
tensor_model_parallel_size=self.tensor_parallel_size,
sequence_parallel=False,
)
self.parallel_attention = MLASelfAttention(
self.transformer_config,
get_gpt_layer_with_transformer_engine_spec(
multi_latent_attention=True
).submodules.self_attention.submodules,
layer_number=1,
attn_mask_type=AttnMaskType.causal,
)
def teardown_method(self, method):
Utils.destroy_model_parallel()
def test_gpu_forward(self):
if is_te_min_version("1.10.0"):
config = self.parallel_attention.config
sequence_length = 64
micro_batch_size = 2
self.parallel_attention.cuda()
# [sequence length, batch size, hidden size]
hidden_states = torch.ones(
(sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)
)
hidden_states = hidden_states.cuda()
attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
output, bias = self.parallel_attention(hidden_states, attention_mask)
assert config.recompute_granularity is None
assert output.shape[0] == sequence_length
assert output.shape[1] == micro_batch_size
assert output.shape[2] == config.hidden_size
assert bias.shape[0] == config.hidden_size