Commit ce02cd51 authored by dongcl

Megatron v0.11.0

parent aeed6d97
...@@ -66,6 +66,10 @@ def unpermute( ...@@ -66,6 +66,10 @@ def unpermute(
): ):
``` ```
### The project supports the [flux kernel](http://10.6.10.68/dcutoolkit/deeplearing/flux)
In tensor-parallel (TP) scenarios, users can enable the flux fused communication-computation kernels for better training and inference performance. The project integrates flux by replacing the corresponding transformer engine methods; to use it, set the environment variable USE_FLUX_OVERLAP=1 and set transformer-impl to transformer_engine.
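A minimal sketch of the switch, for reference (the environment check mirrors the gating used by the patches in this commit; the helper name is illustrative):
```python
# Minimal sketch: the flux patches are gated on this environment variable; any
# non-zero value enables them. Set it before Megatron/dcu_megatron is imported.
import os

os.environ["USE_FLUX_OVERLAP"] = "1"

def flux_overlap_enabled() -> bool:
    # same check used by the patch code in this commit
    return bool(int(os.getenv("USE_FLUX_OVERLAP", "0")))

assert flux_overlap_enabled()
# Remember to also pass: --transformer-impl transformer_engine
```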
### Usage
To run the examples, go to the examples directory, which contains launch scripts for the relevant models; download the dataset yourself from: https://r0ddbu55vzx.feishu.cn/drive/folder/ZxHHfCoX4lg75td2hTqcmiAin3g
``` ```
......
...@@ -99,7 +99,7 @@ class CoreAdaptation(MegatronAdaptationABC): ...@@ -99,7 +99,7 @@ class CoreAdaptation(MegatronAdaptationABC):
) )
from ..core.models.gpt.gpt_model import ( from ..core.models.gpt.gpt_model import (
gpt_model_forward, gpt_model_forward,
gpt_model_init, gpt_model_init_wrapper,
shared_embedding_or_mtp_embedding_weight shared_embedding_or_mtp_embedding_weight
) )
from ..training.utils import get_batch_on_this_tp_rank from ..training.utils import get_batch_on_this_tp_rank
...@@ -116,20 +116,20 @@ class CoreAdaptation(MegatronAdaptationABC): ...@@ -116,20 +116,20 @@ class CoreAdaptation(MegatronAdaptationABC):
# GPT Model # GPT Model
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward', gpt_model_forward) MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward', gpt_model_forward)
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__', gpt_model_init) MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
gpt_model_init_wrapper,
apply_wrapper=True)
from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_model import GPTModel
setattr(GPTModel, 'shared_embedding_or_mtp_embedding_weight', shared_embedding_or_mtp_embedding_weight) setattr(GPTModel, 'shared_embedding_or_mtp_embedding_weight', shared_embedding_or_mtp_embedding_weight)
def patch_core_transformers(self): def patch_core_transformers(self):
from ..core import transformer_block_init_wrapper, transformer_block_forward from ..core import transformer_block_init_wrapper
from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch
# Transformer block # Transformer block
MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__', MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
transformer_block_init_wrapper) transformer_block_init_wrapper)
MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.forward',
transformer_block_forward)
# Transformer config # Transformer config
MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig', MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
...@@ -141,9 +141,9 @@ class CoreAdaptation(MegatronAdaptationABC): ...@@ -141,9 +141,9 @@ class CoreAdaptation(MegatronAdaptationABC):
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity', MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}), torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
apply_wrapper=True) apply_wrapper=True)
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func', # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}), # torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
apply_wrapper=True) # apply_wrapper=True)
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute', MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
torch.compile(mode='max-autotune-no-cudagraphs'), torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True) apply_wrapper=True)
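`MegatronAdaptation.register` is used above in two modes: a direct replacement of the target symbol, or, with `apply_wrapper=True`, applying the given callable (for example `torch.compile(...)`) on top of the original. A rough sketch of those semantics, assuming a hypothetical registry rather than the real MegatronAdaptation implementation:
```python
# Rough sketch of the two registration modes used above (hypothetical registry,
# not the real MegatronAdaptation class).
_REGISTRY = {}

def register(target: str, patch, apply_wrapper: bool = False):
    """Record how `target` should be patched when the adaptation executes."""
    _REGISTRY[target] = (patch, apply_wrapper)

def resolve(target: str, original):
    """Return the object that should replace `target`."""
    patch, apply_wrapper = _REGISTRY.get(target, (None, False))
    if patch is None:
        return original
    # apply_wrapper=True: decorate the original (e.g. torch.compile(original));
    # apply_wrapper=False: the patch itself replaces the original.
    return patch(original) if apply_wrapper else patch
```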
...@@ -166,7 +166,6 @@ class CoreAdaptation(MegatronAdaptationABC): ...@@ -166,7 +166,6 @@ class CoreAdaptation(MegatronAdaptationABC):
def patch_tensor_parallel(self): def patch_tensor_parallel(self):
from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
from ..core.tensor_parallel import vocab_parallel_embedding_forward, vocab_parallel_embedding_init from ..core.tensor_parallel import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
from ..core.tensor_parallel import ColumnParallelLinearPatch, RowParallelLinearPatch, parallel_linear_init_wrapper
# VocabParallelEmbedding # VocabParallelEmbedding
MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward', MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
...@@ -188,17 +187,19 @@ class CoreAdaptation(MegatronAdaptationABC): ...@@ -188,17 +187,19 @@ class CoreAdaptation(MegatronAdaptationABC):
apply_wrapper=True) apply_wrapper=True)
# flux # flux
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__", if int(os.getenv("USE_FLUX_OVERLAP", "0")):
parallel_linear_init_wrapper, from ..core.tensor_parallel import (
apply_wrapper=True) FluxColumnParallelLinear,
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward", FluxRowParallelLinear
ColumnParallelLinearPatch.forward) )
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__", from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
parallel_linear_init_wrapper,
apply_wrapper=True)
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
RowParallelLinearPatch.forward)
MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
FluxColumnParallelLinear)
MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
FluxRowParallelLinear)
MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
get_gpt_layer_with_flux_spec)
def patch_training(self): def patch_training(self):
from ..training.tokenizer import build_tokenizer from ..training.tokenizer import build_tokenizer
...@@ -232,19 +233,22 @@ class LegacyAdaptation(MegatronAdaptationABC): ...@@ -232,19 +233,22 @@ class LegacyAdaptation(MegatronAdaptationABC):
self.patch_legacy_models() self.patch_legacy_models()
def patch_legacy_models(self): def patch_legacy_models(self):
from ..legacy.model.transformer import ParallelMLP, ParallelAttention from ..legacy.model.transformer import ParallelMLPPatch, ParallelAttentionPatch
from ..legacy.model.utils import get_norm
# ParallelMLP # ParallelMLP
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__', MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
ParallelMLP.__init__) ParallelMLPPatch.__init__)
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward', MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
ParallelAttention.forward) ParallelAttentionPatch.forward)
# rms_norm.RMSNorm # rms_norm.RMSNorm
MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward', MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
torch.compile(mode="max-autotune-no-cudagraphs"), torch.compile(mode="max-autotune-no-cudagraphs"),
apply_wrapper=True) apply_wrapper=True)
MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
get_norm)
MegatronAdaptation.execute() MegatronAdaptation.execute()
from .transformer.transformer_block import transformer_block_init_wrapper, transformer_block_forward from .transformer.transformer_block import transformer_block_init_wrapper
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
from typing import List from typing import List
import torch import torch
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
from typing import Literal from typing import Literal
import torch import torch
......
import warnings
from typing import Optional
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.multi_latent_attention import (
MLASelfAttention,
MLASelfAttentionSubmodules,
)
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import (
TransformerLayer,
TransformerLayerSubmodules,
)
from dcu_megatron.core.tensor_parallel.layers import FluxColumnParallelLinear, FluxRowParallelLinear
from megatron.core.utils import is_te_min_version
try:
from megatron.core.extensions.transformer_engine import (
TEDotProductAttention,
TENorm,
)
except ImportError:
warnings.warn('transformer_engine is not installed.')
try:
import apex # pylint: disable=unused-import
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
except ImportError:
warnings.warn('Apex is not installed.')
def get_gpt_layer_with_flux_spec(
num_experts: Optional[int] = None,
moe_grouped_gemm: Optional[bool] = False,
qk_layernorm: Optional[bool] = False,
multi_latent_attention: Optional[bool] = False,
fp8: Optional[str] = None, # pylint: disable=unused-arguments
moe_use_legacy_grouped_gemm: Optional[bool] = False,
) -> ModuleSpec:
"""Use this spec to use flux modules (required for fp8 training).
Args:
num_experts (int, optional): Number of experts. Defaults to None.
moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
Defaults to False.
Returns:
ModuleSpec: Module specification with flux modules
"""
if fp8 is not None:
warnings.warn(
'The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated'
' and will be removed soon. Please update your code accordingly.'
)
mlp = get_mlp_module_flux_spec(
use_te=False,
num_experts=num_experts,
moe_grouped_gemm=moe_grouped_gemm,
moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm,
)
if multi_latent_attention:
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=MLASelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=MLASelfAttentionSubmodules(
linear_q_proj=FluxColumnParallelLinear,
linear_q_down_proj=FluxColumnParallelLinear,
linear_q_up_proj=FluxColumnParallelLinear,
linear_kv_down_proj=FluxColumnParallelLinear,
linear_kv_up_proj=FluxColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=FluxRowParallelLinear,
q_layernorm=TENorm if qk_layernorm else IdentityOp,
kv_layernorm=TENorm if qk_layernorm else IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
else:
# TENorm significantly harms convergence when used
# for QKLayerNorm if TE Version < 1.9;
# we instead use the Apex implementation.
qk_norm = TENorm if is_te_min_version("1.9.0") else FusedLayerNorm
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=FluxColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=FluxRowParallelLinear,
q_layernorm=qk_norm if qk_layernorm else IdentityOp,
k_layernorm=qk_norm if qk_layernorm else IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_mlp_module_flux_spec(
use_te: Optional[bool] = True,
num_experts: Optional[int] = None,
moe_grouped_gemm: Optional[bool] = False,
fp8: Optional[str] = None, # pylint: disable=unused-arguments
moe_use_legacy_grouped_gemm: Optional[bool] = False,
) -> ModuleSpec:
"""Helper function to get module spec for MLP/MoE"""
if fp8 is not None:
warnings.warn(
'The fp8 argument in "_get_mlp_module_spec" has been deprecated'
' and will be removed soon. Please update your code accordingly.'
)
if num_experts is None:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=FluxColumnParallelLinear,
linear_fc2=FluxRowParallelLinear,
),
)
else:
# Mixture of experts with modules in megatron core.
return get_moe_module_spec(
use_te=True,
num_experts=num_experts,
moe_grouped_gemm=moe_grouped_gemm,
moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm,
)
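A usage sketch for the new spec builder, assuming the file lands at `dcu_megatron/core/models/gpt/gpt_layer_specs.py` (as the relative import in the adaptation patch suggests) and that transformer_engine and flux are importable:
```python
# Usage sketch (module path inferred from the imports elsewhere in this commit;
# requires transformer_engine and flux to be installed).
from dcu_megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec

layer_spec = get_gpt_layer_with_flux_spec(
    num_experts=None,            # dense branch -> flux column/row parallel linears
    qk_layernorm=True,
    multi_latent_attention=False,
)
# The attention projections now point at the flux implementations.
print(layer_spec.submodules.self_attention.submodules.linear_qkv)   # FluxColumnParallelLinear
print(layer_spec.submodules.self_attention.submodules.linear_proj)  # FluxRowParallelLinear
```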
import os
import logging import logging
from typing import Literal, Optional from typing import Literal, Optional
from functools import wraps from functools import wraps
...@@ -16,160 +17,76 @@ from megatron.core.packed_seq_params import PackedSeqParams ...@@ -16,160 +17,76 @@ from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.enums import ModelType from megatron.core.transformer.enums import ModelType
from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear
from dcu_megatron.core.utils import tensor_slide from dcu_megatron.core.utils import tensor_slide
from dcu_megatron.core.transformer.mtp.multi_token_predictor import MultiTokenPredictor from dcu_megatron.core.transformer.mtp.multi_token_predictor import MultiTokenPredictor
from dcu_megatron.core.transformer.transformer_config import TransformerConfig from dcu_megatron.core.transformer.transformer_config import TransformerConfig
from dcu_megatron.core.tensor_parallel import FluxColumnParallelLinear
def gpt_model_init_wrapper(fn):
@wraps(fn)
def wrapper(self, *args, **kwargs):
fn(self, *args, **kwargs)
if (
self.post_process
and int(os.getenv("USE_FLUX_OVERLAP", "0"))
):
self.output_layer = FluxColumnParallelLinear(
self.config.hidden_size,
self.vocab_size,
config=self.config,
init_method=self.config.init_method,
bias=False,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.pre_process
and self.share_embeddings_and_output_weights,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
self.setup_embeddings_and_output_layer()
def gpt_model_init(
self, # add mtp
config: TransformerConfig, self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
transformer_layer_spec: ModuleSpec, if self.num_nextn_predict_layers:
vocab_size: int, assert hasattr(self.config, "mtp_spec")
max_sequence_length: int, self.mtp_spec: ModuleSpec = self.config.mtp_spec
pre_process: bool = True, self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
post_process: bool = True, self.recompute_mtp_norm = self.config.recompute_mtp_norm
fp16_lm_cross_entropy: bool = False, self.recompute_mtp_layer = self.config.recompute_mtp_layer
parallel_output: bool = True, self.mtp_loss_scale = self.config.mtp_loss_scale
share_embeddings_and_output_weights: bool = False, if self.post_process and self.training:
position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', self.mtp_layers = torch.nn.ModuleList(
rotary_percent: float = 1.0, [
rotary_base: int = 10000, MultiTokenPredictor(
rope_scaling: bool = False, self.config,
rope_scaling_factor: float = 8.0, self.mtp_spec.submodules,
scatter_embedding_sequence_parallel: bool = True, vocab_size=self.vocab_size,
seq_len_interpolation_factor: Optional[float] = None, max_sequence_length=self.max_sequence_length,
mtp_spec: ModuleSpec = None layer_number=i,
) -> None: pre_process=self.pre_process,
super(GPTModel, self).__init__(config=config) fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
if has_config_logger_enabled(config): position_embedding_type=self.position_embedding_type,
log_config_to_disk(config, locals(), prefix=type(self).__name__) rotary_percent=self.rotary_percent,
seq_len_interpolation_factor=seq_len_interpolation_factor,
self.transformer_layer_spec: ModuleSpec = transformer_layer_spec share_mtp_embedding_and_output_weight=self.share_mtp_embedding_and_output_weight,
self.vocab_size = vocab_size recompute_mtp_norm=self.recompute_mtp_norm,
self.max_sequence_length = max_sequence_length recompute_mtp_layer=self.recompute_mtp_layer,
self.pre_process = pre_process add_output_layer_bias=False
self.post_process = post_process )
self.fp16_lm_cross_entropy = fp16_lm_cross_entropy for i in range(self.num_nextn_predict_layers)
self.parallel_output = parallel_output ]
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
self.position_embedding_type = position_embedding_type
# megatron core pipelining currently depends on model type
# TODO: remove this dependency ?
self.model_type = ModelType.encoder_or_decoder
# These 4 attributes are needed for TensorRT-LLM export.
self.max_position_embeddings = max_sequence_length
self.rotary_percent = rotary_percent
self.rotary_base = rotary_base
self.rotary_scaling = rope_scaling
if self.pre_process:
self.embedding = LanguageModelEmbedding(
config=self.config,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
position_embedding_type=position_embedding_type,
scatter_to_sequence_parallel=scatter_embedding_sequence_parallel,
)
if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
self.rotary_pos_emb = RotaryEmbedding(
kv_channels=self.config.kv_channels,
rotary_percent=rotary_percent,
rotary_interleaved=self.config.rotary_interleaved,
seq_len_interpolation_factor=seq_len_interpolation_factor,
rotary_base=rotary_base,
rope_scaling=rope_scaling,
rope_scaling_factor=rope_scaling_factor,
use_cpu_initialization=self.config.use_cpu_initialization,
)
# Cache for RoPE tensors which do not change between iterations.
self.rotary_pos_emb_cache = {}
# Transformer.
self.decoder = TransformerBlock(
config=self.config,
spec=transformer_layer_spec,
pre_process=self.pre_process,
post_process=self.post_process
)
# Output
if post_process:
if self.config.defer_embedding_wgrad_compute:
# The embedding activation buffer preserves a reference to the input activations
# of the final embedding projection layer GEMM. It will hold the activations for
# all the micro-batches of a global batch for the last pipeline stage. Once we are
# done with all the back props for all the microbatches for the last pipeline stage,
# it will be in the pipeline flush stage. During this pipeline flush we use the
# input activations stored in embedding activation buffer and gradient outputs
# stored in gradient buffer to calculate the weight gradients for the embedding
# final linear layer.
self.embedding_activation_buffer = []
self.grad_output_buffer = []
else:
self.embedding_activation_buffer = None
self.grad_output_buffer = None
self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
self.vocab_size,
config=config,
init_method=config.init_method,
bias=False,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.pre_process
and self.share_embeddings_and_output_weights,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
# add mtp
self.mtp_spec: ModuleSpec = mtp_spec
self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
self.recompute_mtp_norm = self.config.recompute_mtp_norm
self.recompute_mtp_layer = self.config.recompute_mtp_layer
self.mtp_loss_scale = self.config.mtp_loss_scale
if self.post_process and self.training and self.num_nextn_predict_layers:
self.mtp_layers = torch.nn.ModuleList(
[
MultiTokenPredictor(
config,
self.mtp_spec.submodules,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
layer_number=i,
pre_process=self.pre_process,
fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
position_embedding_type=self.position_embedding_type,
rotary_percent=self.rotary_percent,
seq_len_interpolation_factor=seq_len_interpolation_factor,
share_mtp_embedding_and_output_weight=self.share_mtp_embedding_and_output_weight,
recompute_mtp_norm=self.recompute_mtp_norm,
recompute_mtp_layer=self.recompute_mtp_layer,
add_output_layer_bias=False
) )
for i in range(self.num_nextn_predict_layers)
]
)
if self.pre_process or self.post_process:
self.setup_embeddings_and_output_layer()
if has_config_logger_enabled(self.config): if self.pre_process or self.post_process:
log_config_to_disk( setup_mtp_embeddings(self)
self.config, self.state_dict(), prefix=f'{type(self).__name__}_init_ckpt'
)
if self.num_nextn_predict_layers and (self.pre_process or self.post_process): return wrapper
setup_mtp_embeddings(self)
def shared_embedding_or_mtp_embedding_weight(self) -> Tensor: def shared_embedding_or_mtp_embedding_weight(self) -> Tensor:
...@@ -424,10 +341,10 @@ def gpt_model_forward( ...@@ -424,10 +341,10 @@ def gpt_model_forward(
if ( if (
self.num_nextn_predict_layers self.num_nextn_predict_layers
and getattr(self.decoder, "final_layernorm", None) is not None and getattr(self.decoder, "main_final_layernorm", None) is not None
): ):
# move block main model final norms here # move block main model final norms here
hidden_states = self.decoder.final_layernorm(hidden_states) hidden_states = self.decoder.main_final_layernorm(hidden_states)
logits, _ = self.output_layer( logits, _ = self.output_layer(
hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
......
from .layers import ( from .layers import (
parallel_linear_init_wrapper, FluxColumnParallelLinear,
ColumnParallelLinearPatch, FluxRowParallelLinear,
RowParallelLinearPatch,
vocab_parallel_embedding_forward, vocab_parallel_embedding_forward,
vocab_parallel_embedding_init, vocab_parallel_embedding_init,
) )
\ No newline at end of file
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. import os
import logging import logging
from dataclasses import dataclass from dataclasses import dataclass
from typing import Union, Optional, Literal from typing import Union, Optional, Literal
...@@ -11,6 +11,7 @@ from megatron.core.models.common.embeddings.language_model_embedding import Lang ...@@ -11,6 +11,7 @@ from megatron.core.models.common.embeddings.language_model_embedding import Lang
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.module import MegatronModule
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear
from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
from megatron.core.transformer import ModuleSpec, TransformerConfig, build_module from megatron.core.transformer import ModuleSpec, TransformerConfig, build_module
...@@ -136,18 +137,22 @@ class MultiTokenPredictor(MegatronModule): ...@@ -136,18 +137,22 @@ class MultiTokenPredictor(MegatronModule):
self.embedding_activation_buffer = None self.embedding_activation_buffer = None
self.grad_output_buffer = None self.grad_output_buffer = None
self.output_layer = tensor_parallel.ColumnParallelLinear( if int(os.getenv("USE_FLUX_OVERLAP", "0")):
config.hidden_size, column_parallel_linear_impl = FluxColumnParallelLinear
self.vocab_size, else:
config=config, column_parallel_linear_impl = tensor_parallel.ColumnParallelLinear
init_method=config.init_method, self.output_layer = column_parallel_linear_impl(
bias=self.add_output_layer_bias, self.config.hidden_size,
skip_bias_add=False, self.vocab_size,
gather_output=not self.parallel_output, config=self.config,
skip_weight_param_allocation=self.share_mtp_embedding_and_output_weight, init_method=self.config.init_method,
embedding_activation_buffer=self.embedding_activation_buffer, bias=False,
grad_output_buffer=self.grad_output_buffer, skip_bias_add=False,
) gather_output=not self.parallel_output,
skip_weight_param_allocation=self.share_mtp_embedding_and_output_weight,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
def forward( def forward(
self, self,
......
from contextlib import nullcontext
from typing import Optional
from functools import wraps from functools import wraps
import torch
from torch import Tensor
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.utils import make_viewless_tensor
try:
from megatron.core.extensions.transformer_engine import TEDelayedScaling
HAVE_TE = True
except ImportError:
HAVE_TE = False
def transformer_block_init_wrapper(fn): def transformer_block_init_wrapper(fn):
@wraps(fn) @wraps(fn)
...@@ -25,178 +8,8 @@ def transformer_block_init_wrapper(fn): ...@@ -25,178 +8,8 @@ def transformer_block_init_wrapper(fn):
# MTP requires separate layernorms for the main model and the MTP modules, so move the final norm out of the block # MTP requires separate layernorms for the main model and the MTP modules, so move the final norm out of the block
config = args[0] if len(args) > 1 else kwargs['config'] config = args[0] if len(args) > 1 else kwargs['config']
self.move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0 if getattr(config, "num_nextn_predict_layers", 0) > 0:
self.main_final_layernorm = self.final_layernorm
self.final_layernorm = None
return wrapper return wrapper
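Taken on its own, the wrapper above amounts to an attribute hand-off: when `num_nextn_predict_layers > 0`, the block's final norm is re-exposed as `main_final_layernorm` (which `gpt_model_forward` applies) and `final_layernorm` is cleared. A toy illustration with a stand-in object, not the real TransformerBlock:
```python
# Toy illustration of the post-__init__ state produced by the wrapper above
# (stand-in object, not the real TransformerBlock).
class _Block:
    def __init__(self, num_nextn_predict_layers: int):
        self.final_layernorm = object()              # placeholder for the norm module
        if num_nextn_predict_layers > 0:             # MTP enabled
            self.main_final_layernorm = self.final_layernorm
            self.final_layernorm = None              # the GPT model applies it instead

blk = _Block(num_nextn_predict_layers=1)
assert blk.final_layernorm is None and blk.main_final_layernorm is not None
```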
def transformer_block_forward(
self,
hidden_states: Tensor,
attention_mask: Tensor,
context: Tensor = None,
context_mask: Tensor = None,
rotary_pos_emb: Tensor = None,
rotary_pos_cos: Tensor = None,
rotary_pos_sin: Tensor = None,
attention_bias: Tensor = None,
inference_params: InferenceParams = None,
packed_seq_params: PackedSeqParams = None,
sequence_len_offset: Tensor = None,
):
"""
Perform the forward pass through the transformer block.
This method handles the core computation of the transformer, including
self-attention, optional cross-attention, and feed-forward operations.
Args:
hidden_states (Tensor): Input tensor of shape [s, b, h] where s is the
sequence length, b is the batch size, and h is the hidden size.
attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
self-attention.
context (Tensor, optional): Context tensor for cross-attention.
context_mask (Tensor, optional): Mask for cross-attention context
rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
attention_bias (Tensor): Bias tensor for Q * K.T of shape in shape broadcastable
to [b, num_head, sq, skv], e.g. [1, 1, sq, skv].
Used as an alternative to apply attention mask for TE cuDNN attention.
inference_params (InferenceParams, optional): Parameters for inference-time
optimizations.
packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence
processing.
Returns:
Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape
[s, b, h], and optionally the updated context tensor if cross-attention is used.
"""
if not self.pre_process:
# See set_input_tensor()
hidden_states = self.input_tensor
# Update the inference parameters with the current batch size in case it is variable
if inference_params and not self.training:
inference_params.current_batch_size = hidden_states.size(1)
# Viewless tensor.
# - We only need to create a viewless tensor in the case of micro batch
# size (mbs) == 1, since in this case, 'hidden_states.transpose()'
# above creates a view tensor, and '.contiguous()' is a pass-through.
# For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
# the need to make it viewless.
#
# However, we don't explicitly check mbs == 1 here because
# make_viewless_tensor() has negligible overhead when its input
# is already viewless.
#
# - For the 'else' case above, calling make_viewless_tensor() here is
# likely redundant, since p2p_communication.py (likely originator)
# already creates viewless tensors. That said, make_viewless_tensor()
# is called here to be future-proof and corner-case-proof.
hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
if self.config.sequence_parallel:
rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
else:
rng_context = nullcontext()
if self.config.fp8:
import transformer_engine # To keep out TE dependency when not training in fp8
if self.config.fp8 == "e4m3":
fp8_format = transformer_engine.common.recipe.Format.E4M3
elif self.config.fp8 == "hybrid":
fp8_format = transformer_engine.common.recipe.Format.HYBRID
else:
raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
fp8_recipe = TEDelayedScaling(
config=self.config,
fp8_format=fp8_format,
override_linear_precision=(False, False, not self.config.fp8_wgrad),
)
fp8_group = None
if parallel_state.model_parallel_is_initialized():
fp8_group = parallel_state.get_amax_reduction_group(
with_context_parallel=True, tp_only_amax_red=self.tp_only_amax_red
)
fp8_context = transformer_engine.pytorch.fp8_autocast(
enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
)
else:
fp8_context = nullcontext()
with rng_context, fp8_context:
# Forward pass.
if self.config.recompute_granularity == 'full' and self.training:
hidden_states = self._checkpointed_forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
context=context,
context_mask=context_mask,
rotary_pos_emb=rotary_pos_emb,
attention_bias=attention_bias,
packed_seq_params=packed_seq_params,
)
else:
for l_no, layer in enumerate(self.layers):
with self.offload_context:
layer.use_cudagraph = True
if (len(self.cuda_graphs) == 0) or (not self.training):
hidden_states, context = layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
context=context,
context_mask=context_mask,
rotary_pos_emb=rotary_pos_emb,
rotary_pos_cos=rotary_pos_cos,
rotary_pos_sin=rotary_pos_sin,
attention_bias=attention_bias,
inference_params=inference_params,
packed_seq_params=packed_seq_params,
sequence_len_offset=sequence_len_offset,
)
else:
# CUDA graph replay for layer `l_no` and microbatch
# `self.current_microbatch`. TransformerEngine versions>=1.10
# allow keyword arguments with CUDA graph. However, CUDA graph
# acccepts only Tensor inputs and Tensor outputs. Hence,
# `inference_params` and `packed_seq_params` are excluded from
# input list while output is limited to `hidden_states`.
cg_index = self.current_microbatch % len(self.cuda_graphs[l_no])
assert not any(
[inference_params, packed_seq_params]
), "CUDA graph accepts only Tensor inputs."
optional_inputs = self.get_cuda_graph_optional_args(
attention_mask,
context,
context_mask,
rotary_pos_emb,
attention_bias,
inference_params,
packed_seq_params,
)
hidden_states = self.cuda_graphs[l_no][cg_index](
hidden_states, **optional_inputs
)
if (
torch.is_grad_enabled()
and self.config.cpu_offloading
and self.group_prefetch_offload_commit_async is not None
):
hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
# Final layer norm.
if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
hidden_states = self.final_layernorm(hidden_states)
# TENorm produces a "viewed" tensor. This will result in schedule.py's
# deallocate_output_tensor() throwing an error, so a viewless tensor is
# created to prevent this.
hidden_states = make_viewless_tensor(
inp=hidden_states, requires_grad=True, keep_graph=True
)
return hidden_states
...@@ -26,9 +26,6 @@ class ExtraTransformerConfig: ...@@ -26,9 +26,6 @@ class ExtraTransformerConfig:
################## ##################
# flux # flux
################## ##################
use_flux: bool = False
"""If set, flux will be used in ColumnParallelLinear and RowParallelLinear"""
flux_transpose_weight: bool = False flux_transpose_weight: bool = False
......
import torch import torch
from typing import List, Optional, Union from typing import List, Optional, Union
from importlib.metadata import version
from packaging.version import Version as PkgVersion
_flux_version = None
def get_flux_version():
"""Get flux version from __version__; if not available use pip's. Use caching."""
def get_flux_version_str():
import flux
if hasattr(flux, '__version__'):
return str(flux.__version__)
else:
return version("flux")
global _flux_version
if _flux_version is None:
_flux_version = PkgVersion(get_flux_version_str())
return _flux_version
def is_flux_min_version(version, check_equality=True):
"""Check if minimum version of `flux` is installed."""
if check_equality:
return get_flux_version() >= PkgVersion(version)
return get_flux_version() > PkgVersion(version)
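A short usage sketch for these helpers; the import path is inferred from the `dcu_megatron.core.utils` imports elsewhere in this commit, and the version threshold is illustrative:
```python
# Usage sketch: gate an optional code path on the installed flux version.
# "1.0.0" is an illustrative threshold, not a requirement from this commit.
from dcu_megatron.core.utils import get_flux_version, is_flux_min_version

if is_flux_min_version("1.0.0"):
    print(f"flux {get_flux_version()} detected, overlap path available")
else:
    print("flux too old, falling back to the default linear layers")
```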
def tensor_slide( def tensor_slide(
......
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
from torch import nn
class RMSNorm(torch.nn.Module):
def __init__(self,
dim: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
config: dict = None):
"""RMS Normaliation module
Args:
dim (int): The width of input, i.e. hidden size
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
"""
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.ones(dim))
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
@torch.compile(mode="max-autotune-no-cudagraphs")
def forward(self, x):
output = self._norm(x.float()).type_as(x)
return output * self.weight
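A quick usage sketch for this reference RMSNorm; shapes are illustrative and the compiled forward assumes a PyTorch build with `torch.compile` support:
```python
# Usage sketch: normalization is over the last (hidden) dimension.
import torch

norm = RMSNorm(dim=4096, eps=1e-6)
x = torch.randn(8, 2, 4096)          # [s, b, h]; shapes are illustrative
y = norm(x)
assert y.shape == x.shape
```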
import torch
from typing import Any, Callable, Dict, Optional, Tuple, Union
import lightop # rmsnorm_forward,rmsnorm_backward
from functools import partial
from megatron.core.utils import is_torch_min_version
if is_torch_min_version("2.4.0a0"):
custom_fwd = partial(torch.amp.custom_fwd, device_type="cuda")
custom_bwd = partial(torch.amp.custom_bwd, device_type="cuda")
else:
custom_fwd = torch.cuda.amp.custom_fwd
custom_bwd = torch.cuda.amp.custom_bwd
class _LightopRMSNorm(torch.autograd.Function):
""" 使用lightop实现rmsnorm"""
@staticmethod
# @custom_fwd
def forward(ctx,
inp: torch.Tensor,
weight: torch.Tensor,
ln_out: torch.Tensor,
eps: float,
is_grad_enabled):
output = lightop.rmsnorm_forward(inp, weight, ln_out, eps, training=True)  # output = (ln_out, rsigma)
rsigma = output[1]
if is_grad_enabled:
ctx.save_for_backward(inp, weight, rsigma)
return output[0]
@staticmethod
# @custom_bwd
def backward(ctx, grad_output):
inp, weight, rsigma = ctx.saved_tensors
dgrad, dgamma = lightop.rmsnorm_backward(grad_output, inp, rsigma, weight)
return dgrad, dgamma, None, None, None
class LightopRMSNorm(torch.nn.Module):
def __init__(self,
dim: int,
eps: float = 1e-6,):
"""RMS Normaliation module
Args:
dim (int): The width of input, i.e. hidden size
eps (float): epsilon to use for the norm, default to 1e-6
"""
super().__init__()
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(dim))
# @no_torch_dynamo()  # dynamic torch._dynamo.disable
def forward(self, inp: torch.Tensor, is_first_microbatch: Optional[bool] = None):
if torch.is_grad_enabled():
fwd_fn = _LightopRMSNorm.apply
args = []
else:
fwd_fn = _LightopRMSNorm.forward
args = [None]
ln_out = torch.empty_like(inp, dtype=inp.dtype, memory_format=torch.contiguous_format)
args += (inp, self.weight, ln_out, self.eps, torch.is_grad_enabled())
out = fwd_fn(*args)
return out
...@@ -3,14 +3,21 @@ import torch.nn.functional as F ...@@ -3,14 +3,21 @@ import torch.nn.functional as F
from megatron.training import get_args from megatron.training import get_args
from megatron.core import tensor_parallel from megatron.core import tensor_parallel
from megatron.legacy.model.enums import AttnType
from megatron.core.models.common.embeddings import apply_rotary_pos_emb
from megatron.legacy.model.module import MegatronModule from megatron.legacy.model.module import MegatronModule
from megatron.legacy.model.transformer import ParallelMLP
from megatron.legacy.model.utils import ( from megatron.legacy.model.utils import (
erf_gelu, erf_gelu,
openai_gelu, openai_gelu,
) )
try:
from einops import rearrange
except ImportError:
rearrange = None
class ParallelMLP(MegatronModule): class ParallelMLPPatch(MegatronModule):
"""MLP. """MLP.
MLP will take the input with h hidden state, project it to 4*h MLP will take the input with h hidden state, project it to 4*h
...@@ -74,7 +81,7 @@ class ParallelMLP(MegatronModule): ...@@ -74,7 +81,7 @@ class ParallelMLP(MegatronModule):
) )
class ParallelAttention(MegatronModule): class ParallelAttentionPatch(MegatronModule):
"""Parallel self-attention layer abstract class. """Parallel self-attention layer abstract class.
Self-attention layer takes input with size [s, b, h] Self-attention layer takes input with size [s, b, h]
......
from megatron.training import get_args
from megatron.legacy.model import LayerNorm
from .rms_norm import RMSNorm, LightopRMSNorm
def get_norm(config):
args = get_args()
if args.normalization == "LayerNorm":
return LayerNorm(
config.hidden_size,
eps=config.layernorm_epsilon,
no_persist_layer_norm=not config.persist_layer_norm,
sequence_parallel=config.sequence_parallel,
apply_layernorm_1p=args.apply_layernorm_1p)
elif args.normalization == "RMSNorm":
if args.apply_layernorm_1p:
raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.')
return RMSNorm(dim=config.hidden_size,
eps=config.layernorm_epsilon,
sequence_parallel=config.sequence_parallel)
elif args.normalization == "LightopRMSNorm":
return LightopRMSNorm(dim=config.hidden_size,
eps=config.layernorm_epsilon)
else:
raise Exception(f"unsupported norm type '{args.normalization}'.")
...@@ -51,6 +51,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): ...@@ -51,6 +51,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
# Standard arguments. # Standard arguments.
parser = _add_network_size_args(parser) parser = _add_network_size_args(parser)
parser = _add_extra_network_size_args(parser)
parser = _add_regularization_args(parser) parser = _add_regularization_args(parser)
parser = _add_training_args(parser) parser = _add_training_args(parser)
parser = _add_extra_training_args(parser) parser = _add_extra_training_args(parser)
...@@ -106,6 +107,18 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): ...@@ -106,6 +107,18 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
return args return args
def _add_extra_network_size_args(parser):
# Remove the original argument
remove_original_params(parser, ["normalization"])
# Re-register the argument
group = parser.add_argument_group(title='extra network size args')
group.add_argument('--normalization', default='LayerNorm',
choices=['LayerNorm', 'RMSNorm', 'LightopRMSNorm'],
help='Which normalization technique to use.')
return parser
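`remove_original_params` is used here but not shown in this diff; a plausible sketch of such a helper (hypothetical, relying on argparse internals) is:
```python
# Hypothetical sketch of remove_original_params (the real helper is not part of
# this diff). It drops an existing argparse option so the same flag can be
# re-registered with new choices; it relies on argparse internals.
def remove_original_params(parser, names):
    for action in list(parser._actions):
        if action.dest in names:
            parser._remove_action(action)
            for option_string in action.option_strings:
                parser._option_string_actions.pop(option_string, None)
```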
def _add_extra_distributed_args(parser): def _add_extra_distributed_args(parser):
group = parser.add_argument_group(title='extra distributed args') group = parser.add_argument_group(title='extra distributed args')
group.add_argument('--rank', default=-1, type=int, group.add_argument('--rank', default=-1, type=int,
...@@ -169,9 +182,7 @@ def _add_mtp_args(parser): ...@@ -169,9 +182,7 @@ def _add_mtp_args(parser):
def _add_flux_args(parser): def _add_flux_args(parser):
group = parser.add_argument_group(title='multi token prediction') group = parser.add_argument_group(title='flux args')
group.add_argument('--use-flux', action='store_true', default=False,
help='If set, flux will be used in ColumnParallelLinear and RowParallelLinear')
group.add_argument('--flux-transpose-weight', action='store_true', default=False, group.add_argument('--flux-transpose-weight', action='store_true', default=False,
help='Whether to transpose weight when using flux kernel') help='Whether to transpose weight when using flux kernel')
return parser return parser
#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# use an updated rocblas build
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: collapse multi-stream execution into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# add synchronization during profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# log rocblas GEMM sizes
# export ROCBLAS_LAYER=3
# export HIPBLASLT_LOG_LEVEL=3
# log flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# increase the compile cache size
export cache_size_limit=64
# lightop operator library
export PYTORCH_ROCM_ARCH='gfx906;gfx926;gfx936'
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/redpajama_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 80 #80 #80 #40 # 20 #
--hidden-size 8192
--ffn-hidden-size 22016 # 28672
--num-attention-heads 64
--max-position-embeddings 8192
--group-query-attention
--num-query-groups 8
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights # keep embedding and output weights separate for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use these two args for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two args for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use these two args for the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 512 #32 #512 #256 # 64 #240 #60 #512 #64
--train-iters 300
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires setting loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
# --no-check-for-nan-in-loss-and-grad
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly instead of being summed (onto one device) and then averaged
# --recompute-activations
# --recompute-granularity full # enable recomputation to cut memory at the cost of extra time
# --recompute-num-layers 1 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap the DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM (mcore path)
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM (mcore path)
--use-flash-attn
)
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--context-parallel-size 1
# --num-layers-per-virtual-pipeline-stage 1
# --microbatch-group-size-per-virtual-pipeline-stage 5
# --no-overlap-p2p-communication # 开启后
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #8192 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/tokenizer.model
# --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 500
--eval-interval 50
--eval-iters 3
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u ../../pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
#!/bin/bash
# set -eux
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
# nccl env
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipblaslt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: collapse multi-stream execution into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache size
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # keep embedding and output weights separate for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use these two args for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two args for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use these two args for the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires setting loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly instead of being summed (onto one device) and then averaged
# --recompute-granularity full # enable recomputation to cut memory at the cost of extra time
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap the DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM, optimization not adapted yet
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn
)
# environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # 开启后
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 50
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# use an updated rocblas build
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
# torch: collapse multi-stream execution into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# add synchronization during profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# log rocblas GEMM sizes
# export ROCBLAS_LAYER=3
# log flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# increase the compile cache size
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 126 #96 #8 # 126
--hidden-size 16384
--ffn-hidden-size 53248
--num-attention-heads 128
--max-position-embeddings 16384
--group-query-attention
--num-query-groups 16
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights # keep embedding and output weights separate for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use these two args for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two args for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl transformer_engine # use these two args for the mcore path
--use-mcore-models
--micro-batch-size 1
--global-batch-size 6912 # 252 #32 # 64 #240 #60 #512 #64
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires setting loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly instead of being summed (onto one device) and then averaged
# --recompute-granularity full # enable recomputation to cut memory at the cost of extra time
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap the DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM, optimization not adapted yet
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM, optimization not adapted yet
--use-flash-attn-cutlass
)
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 18 # 7 layer/gpu
--context-parallel-size 2
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama3Tokenizer
--tokenizer-model /public/home/wangxj/Downloads/model_weights/llama3.2/tokenizer.model
# --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
${APP}
# case ${LOCAL_RANK} in
# [0])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [1])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=1 --membind=1 ${APP}
# ;;
# [2])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=2 --membind=2 ${APP}
# ;;
# [3])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=3 --membind=3 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [4])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=4 --membind=4 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [5])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=5 --membind=5 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [6])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=6 --membind=6 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [7])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=7 --membind=7 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# esac
\ No newline at end of file