Commit 64c0033e authored by liangjing

update

parent 4b097dee
Pipeline #1849 passed
@@ -11,40 +11,18 @@
 - [Reference](#参考)
 # Environment Setup
-1. Install the base dependencies
+1. Pull a suitable image
 <pre>
-pip install -r requirements.txt
+docker pull nvcr.io/nvidia/pytorch:24.06-py3
 </pre>
-2. Install the DCU-related whl packages
-Download directory for the DCU packages: [https://cancon.hpccube.com:65024/4/main](https://cancon.hpccube.com:65024/4/main)
-pytorch whl package: pytorch ---> dtk-24.04.1
-Download the pytorch whl that matches your Python version
-<pre>
-pip install torch*   (the downloaded torch whl)
-</pre>
-torchvision whl package: vision ---> dtk-24.04.1
-Download the torchvision whl that matches your Python version
-<pre>
-pip install torchvision*   (the downloaded torchvision whl)
-</pre>
-apex whl package: apex ---> dtk-24.04.1
-Download the apex whl that matches your Python version
-<pre>
-pip install apex*   (the downloaded apex whl)
-</pre>
-If installation via pip install is slow, add a mirror: -i https://pypi.tuna.tsinghua.edu.cn/simple/
-3. Install unsloth
-<pre>
-cd ./unsloth
-pip3 install -e .
-</pre>
+2. Start a container and enter it
+<pre>
+docker run -it --name xx --gpus all --network=host --ipc=host --privileged -v /path_to_work/:/path_to_work/ --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
+docker exec -it xx bash
+</pre>
 # Download the vocabulary files
......
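A minimal sanity check (an editorial sketch, assuming the `nvcr.io/nvidia/pytorch:24.06-py3` container above is running) is to confirm that PyTorch can see the GPUs before continuing:

<pre>
# Hypothetical verification snippet, not part of the README diff: run inside the container.
import torch

print(torch.__version__)          # PyTorch build shipped with the 24.06 image
print(torch.cuda.is_available())  # expect True when the devices are passed through
print(torch.cuda.device_count())  # number of visible accelerators
</pre>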
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron initialization."""
import logging
import random
import os
import time
import numpy as np
import torch
from datetime import timedelta
from megatron.legacy import fused_kernels
from megatron.training import get_adlr_autoresume
from megatron.training import get_args
from megatron.training import get_tensorboard_writer
from megatron.core import mpu, tensor_parallel
from megatron.training.arguments import parse_args, validate_args
from megatron.training.yaml_arguments import validate_yaml
from megatron.training.checkpointing import load_args_from_checkpoint
from megatron.training.global_vars import set_global_variables
from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
from megatron.core.fusions.fused_bias_gelu import bias_gelu
from megatron.core.fusions.fused_bias_swiglu import bias_swiglu
logger = logging.getLogger(__name__)
def initialize_megatron(
extra_args_provider=None,
args_defaults={},
ignore_unknown_args=False,
allow_no_cuda=False,
skip_mpu_initialization=False,
get_embedding_ranks=None,
get_position_embedding_ranks=None
):
"""Set global variables, initialize distributed, and
set autoresume and random seeds.
`allow_no_cuda` should not be set unless using megatron for cpu only
data processing. In general this arg should not be set unless you know
what you are doing.
Returns a function to finalize distributed env initialization
(optionally, only when args.lazy_mpu_init == True)
"""
if not allow_no_cuda:
# Make sure cuda is available.
assert torch.cuda.is_available(), "Megatron requires CUDA."
# Parse arguments
args = parse_args(extra_args_provider, ignore_unknown_args)
# Prep for checkpoint conversion.
if args.ckpt_convert_format is not None:
assert args.ckpt_convert_save is not None
assert args.load is not None
args.exit_on_missing_checkpoint = True
if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
assert args.load is not None, "--use-checkpoint-args requires --load argument"
load_args_from_checkpoint(args)
if args.yaml_cfg is not None:
args = validate_yaml(args, args_defaults)
else:
validate_args(args, args_defaults)
# set global args, build tokenizer, and set adlr-autoresume,
# tensorboard-writer, and timers.
set_global_variables(args)
# set logging level
setup_logging()
# torch.distributed initialization
def finish_mpu_init():
args = get_args()
# Pytorch distributed.
_initialize_distributed(get_embedding_ranks, get_position_embedding_ranks)
# Random seeds for reproducibility.
if args.rank == 0:
print("> setting random seeds to {} ...".format(args.seed))
_set_random_seed(args.seed, args.data_parallel_random_init)
if skip_mpu_initialization:
return None
args = get_args()
if args.lazy_mpu_init:
# TODO is this still a necessary option?
args.use_cpu_initialization = True
# delayed initialization of DDP-related stuff
# We only set basic DDP globals
mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size)
# and return function for external DDP manager
# to call when it has DDP initialized
mpu.set_tensor_model_parallel_rank(args.rank)
return finish_mpu_init
else:
# Megatron's MPU is the master. Complete initialization right away.
finish_mpu_init()
# Autoresume.
_init_autoresume()
# Compile dependencies.
_compile_dependencies()
if args.tp_comm_overlap:
_initialize_tp_communicators()
# No continuation function
return None
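# Usage sketch (editorial comment; names below are hypothetical): a typical caller does
#     maybe_finalize = initialize_megatron(extra_args_provider=my_args_provider)
#     if maybe_finalize is not None:   # only returned when args.lazy_mpu_init is set
#         maybe_finalize()             # external DDP manager completes distributed/mpu init later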
def _compile_dependencies():
args = get_args()
# =========================
# Compile dataset C++ code.
# =========================
# TODO: move this to ninja
if torch.distributed.get_rank() == 0:
start_time = time.time()
print("> compiling dataset index builder ...")
from megatron.core.datasets.utils import compile_helpers
compile_helpers()
print(
">>> done with dataset index builder. Compilation time: {:.3f} "
"seconds".format(time.time() - start_time),
flush=True,
)
# ==================
# Load fused kernels
# ==================
# Custom kernel constraints check.
seq_len = args.seq_length
attn_batch_size = (
args.num_attention_heads / args.tensor_model_parallel_size
) * args.micro_batch_size
# Constraints on sequence length and attn_batch_size to enable warp based
# optimization and upper triangular optimization (for causal mask)
custom_kernel_constraint = (
seq_len > 16
and seq_len <= 16384
and seq_len % 4 == 0
and attn_batch_size % 4 == 0
)
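# Worked example with illustrative values (not taken from any config): num_attention_heads=32,
# tensor_model_parallel_size=8 and micro_batch_size=4 give attn_batch_size = (32 / 8) * 4 = 16;
# combined with seq_length=4096 (16 < 4096 <= 16384 and 4096 % 4 == 0) the constraint holds,
# so the fused softmax kernel is eligible when fp16/bf16 and masked_softmax_fusion are enabled.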
# Print a warning.
if not (
(args.fp16 or args.bf16)
and custom_kernel_constraint
and args.masked_softmax_fusion
):
if args.rank == 0:
print(
"WARNING: constraints for invoking optimized"
" fused softmax kernel are not met. We default"
" back to unfused kernel invocations.",
flush=True,
)
# Always build on rank zero first.
if torch.distributed.get_rank() == 0:
start_time = time.time()
print("> compiling and loading fused kernels ...", flush=True)
#fused_kernels.load(args)
torch.distributed.barrier()
else:
torch.distributed.barrier()
#fused_kernels.load(args)
# Simple barrier to make sure all ranks have passed the
# compilation phase successfully before moving on to the
# rest of the program. We think this might ensure that
# the lock is released.
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(
">>> done with compiling and loading fused kernels. "
"Compilation time: {:.3f} seconds".format(time.time() - start_time),
flush=True,
)
def _initialize_tp_communicators():
""" initializing the communicators with user buffers for high-performance tensor-model-parallel
communication overlap """
try:
import yaml
import transformer_engine
from transformer_engine.pytorch import module as te_module
except ImportError:
raise RuntimeError("Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and "
"'transformer_engine' packages")
args = get_args()
if args.tp_comm_overlap_cfg is not None:
with open(args.tp_comm_overlap_cfg,"r") as stream:
ub_cfgs = yaml.safe_load(stream)
else:
ub_cfgs = {}
input_shape = [(args.seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size]
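# Illustrative example (values are not defaults): seq_length=4096, micro_batch_size=1,
# context_parallel_size=1, hidden_size=8192 gives input_shape = [4096, 8192].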
#We create a MPI process group, which is needed to bootstrap the pipelined
#tensor-model-parallel communication overlap
torch.distributed.new_group(backend='mpi')
te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size,
use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,)
def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
"""Initialize torch.distributed and core model parallel."""
args = get_args()
device_count = torch.cuda.device_count()
if torch.distributed.is_initialized():
if args.rank == 0:
print(
"torch distributed is already initialized, "
"skipping initialization ...",
flush=True,
)
args.rank = torch.distributed.get_rank()
args.world_size = torch.distributed.get_world_size()
else:
if args.rank == 0:
print("> initializing torch distributed ...", flush=True)
# Manually set the device ids.
if device_count > 0:
#torch.cuda.set_device(args.local_rank)
#device_id = torch.device(f'cuda:{args.local_rank}')
device_id = args.rank % device_count
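# Illustrative example: rank 11 on nodes with 8 visible devices gives device_id = 11 % 8 = 3,
# which must agree with the launcher-provided local_rank below.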
if args.local_rank is not None:
assert (
args.local_rank == device_id
), "expected local-rank to be the same as rank % device-count."
else:
args.local_rank = device_id
torch.cuda.set_device(device_id)
else:
device_id = None
# Call the init process
torch.distributed.init_process_group(
backend=args.distributed_backend,
world_size=args.world_size,
rank=args.rank,
init_method=args.dist_url,
timeout=timedelta(minutes=args.distributed_timeout_minutes),
)
#init_process_group_kwargs = {
# 'backend' : args.distributed_backend,
# 'world_size': args.world_size,
# 'rank': args.rank,
# 'timeout': timedelta(minutes=args.distributed_timeout_minutes),
#}
#torch.distributed.init_process_group(**init_process_group_kwargs)
# Set the tensor model-parallel, pipeline model-parallel, and
# data-parallel communicators.
if device_count > 0:
if mpu.model_parallel_is_initialized():
print("model parallel is already initialized")
else:
mpu.initialize_model_parallel(
args.tensor_model_parallel_size,
args.pipeline_model_parallel_size,
args.virtual_pipeline_model_parallel_size,
args.pipeline_model_parallel_split_rank,
context_parallel_size=args.context_parallel_size,
expert_model_parallel_size=args.expert_model_parallel_size,
distributed_timeout_minutes=args.distributed_timeout_minutes,
nccl_communicator_config_path=args.nccl_communicator_config_path,
order='tp-cp-ep-dp-pp' if not args.use_tp_pp_dp_mapping else 'tp-pp-dp',
encoder_tensor_model_parallel_size=args.encoder_tensor_model_parallel_size,
encoder_pipeline_model_parallel_size=args.encoder_pipeline_model_parallel_size,
get_embedding_ranks=get_embedding_ranks,
get_position_embedding_ranks=get_position_embedding_ranks,
)
if args.rank == 0:
print(
f"> initialized tensor model parallel with size "
f"{mpu.get_tensor_model_parallel_world_size()}"
)
print(
f"> initialized pipeline model parallel with size "
f"{mpu.get_pipeline_model_parallel_world_size()}"
)
def _init_autoresume():
"""Set autoresume start time."""
autoresume = get_adlr_autoresume()
if autoresume:
torch.distributed.barrier()
autoresume.init()
torch.distributed.barrier()
def _set_random_seed(seed_, data_parallel_random_init=False):
"""Set random seed for reproducability."""
if seed_ is not None and seed_ > 0:
# Ensure that different pipeline MP stages get different seeds.
seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
# Ensure different data parallel ranks get different seeds
if data_parallel_random_init:
seed = seed + (10 * mpu.get_data_parallel_rank())
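# Illustrative example: seed_=1234, pipeline-parallel rank 2, data-parallel rank 3 gives
# seed = 1234 + 200 = 1434, or 1434 + 30 = 1464 when data_parallel_random_init is enabled.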
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.device_count() > 0:
tensor_parallel.model_parallel_cuda_manual_seed(seed)
else:
raise ValueError("Seed ({}) should be a positive integer.".format(seed))
def write_args_to_tensorboard():
"""Write arguments to tensorboard."""
args = get_args()
writer = get_tensorboard_writer()
if writer:
for arg in vars(args):
writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration)
def set_jit_fusion_options():
"""Set PyTorch JIT layer fusion options."""
# flags required to enable jit fusion kernels
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
# nvfuser
torch._C._jit_set_profiling_executor(True)
torch._C._jit_set_profiling_mode(True)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_set_nvfuser_enabled(False) #True
torch._C._debug_set_autodiff_subgraph_inlining(False)
else:
# legacy pytorch fuser
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
_warmup_jit_function()
def _warmup_jit_function():
"""Compilie JIT functions before the main training steps"""
args = get_args()
if args.bf16:
dtype = torch.bfloat16
elif args.fp16:
dtype = torch.float16
else:
dtype = torch.float32
# Warmup fused bias+gelu
bias = torch.rand(
args.ffn_hidden_size // args.tensor_model_parallel_size,
dtype=dtype,
device="cuda",
)
input = torch.rand(
(
args.seq_length // args.context_parallel_size,
args.micro_batch_size,
args.ffn_hidden_size // args.tensor_model_parallel_size,
),
dtype=dtype,
device="cuda",
)
# Warmup JIT fusions with the input grad_enable state of both forward
# prop and recomputation
for bias_grad, input_grad in zip([True, True], [False, True]):
bias.requires_grad, input.requires_grad = bias_grad, input_grad
for _ in range(5):
if args.swiglu:
output = bias_swiglu(input, bias)
else:
output = bias_gelu(bias, input)
del bias, input, output
# Warmup fused bias+dropout+add
if args.sequence_parallel:
seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
else:
seq_length = args.seq_length
input = torch.rand(
(seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size),
dtype=dtype,
device="cuda",
)
residual = torch.rand(
(seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size),
dtype=dtype,
device="cuda",
)
bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as(
residual
)
dropout_rate = 0.1
# Warmup JIT fusions with the input grad_enable state of both forward
# prop and recomputation
for input_grad, bias_grad, residual_grad in zip(
[False, True], [True, True], [True, True]
):
input.requires_grad = input_grad
bias.requires_grad = bias_grad
residual.requires_grad = residual_grad
for _ in range(5):
output = bias_dropout_add_fused_train([input, bias], residual, dropout_rate)
del bias, input, residual, output
torch.cuda.empty_cache()
def setup_logging() -> None:
""" Sets the default logging level based on cmdline args and env vars.
Precedence:
1. Command line argument `--logging-level`
2. Env var `MEGATRON_LOGGING_LEVEL`
3. Default logging level (INFO)
Returns: None
"""
args = get_args()
logging_level = None
env_logging_level = os.getenv('MEGATRON_LOGGING_LEVEL', None)
if env_logging_level is not None:
logging_level = int(env_logging_level)
if args.logging_level is not None:
logging_level = args.logging_level
if logging_level is not None:
logger.info(f'Setting logging level to {logging_level}')
logging.getLogger().setLevel(logging_level)
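# Illustrative usage: MEGATRON_LOGGING_LEVEL=10 selects DEBUG (20 is INFO); a --logging-level
# command line argument takes precedence over the environment variable.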
@@ -26,11 +26,9 @@ class RMSNorm(torch.nn.Module):
             setattr(self.weight, 'sequence_parallel', sequence_parallel)

-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def forward(self, x):
         output = self._norm(x.float()).type_as(x)
         return output * self.weight
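For reference, the `_norm`/`forward` pair shown above computes the standard RMSNorm (in fp32, then cast back to the input dtype):

$$\mathrm{RMSNorm}(x) = \frac{x}{\sqrt{\tfrac{1}{d}\sum_{i=1}^{d} x_i^{2} + \varepsilon}} \odot w$$

where `d` is the hidden dimension, `self.eps` corresponds to $\varepsilon$, and `self.weight` is the learned scale $w$.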
@@ -40,9 +40,6 @@ from megatron.legacy.model.utils import (
 )
 from megatron.training import get_args, get_timers

-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
-
 from .module import MegatronModule

 try:
@@ -59,10 +56,6 @@ except ImportError:
         )
     except ImportError:
         flash_attn_unpadded_func = None
-try:
-    from flash_attn.flash_attn_triton import flash_attn_func
-except ImportError:
-    flash_attn_func = None

 """ We use the following notation throughout this file:
     h: hidden size
@@ -140,7 +133,6 @@ class ParallelMLP(MegatronModule):
         elif args.onnx_safe:
             self.activation_func = erf_gelu
         elif args.swiglu:
-            @torch.compile(mode="max-autotune-no-cudagraphs")
             def swiglu(x):
                 x = torch.chunk(x, 2, dim=-1)
                 return F.silu(x[0]) * x[1]
@@ -164,7 +156,7 @@ class ParallelMLP(MegatronModule):
             input_is_parallel=True,
             is_expert=is_expert,
         )
-    @torch.compile(mode="max-autotune-no-cudagraphs")
+
     def forward(self, hidden_states):
         # [s, b, 4hp]
@@ -475,10 +467,6 @@ class FlashSelfAttention(torch.nn.Module):
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout

-        # Use FlashAttention-2 when args.use_flash_attn_ck is True
-        args = get_args()
-        self.flash_attn_func = flash_attn_unpadded_func
-
     def forward(self, q, k, v):
         """Implements the multihead softmax attention.
         Arguments
@@ -520,38 +508,6 @@ class FlashSelfAttention(torch.nn.Module):
         output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
         return output

-
-class FlashSelfAttentionTriton(torch.nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                       (default: 1/sqrt(d_keys) where d_keys is computed at
-                       runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0,
-                 device=None, dtype=None):
-        super().__init__()
-        assert flash_attn_func is not None, ('Triton version of FlashAttention is not installed.')
-        assert rearrange is not None, 'Please install einops first, e.g., with pip install einops'
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-
-    def forward(self, q, k, v):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
-        """
-        assert q.dtype in [torch.float16, torch.bfloat16]
-        assert q.is_cuda
-        q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous()
-                   for x in (q, k, v)]
-        output = flash_attn_func(q, k, v, self.causal)
-        output = rearrange(output, 'b s h d -> h b (s d)').contiguous()
-        return output
-

 class ParallelAttention(MegatronModule):
     """Parallel self-attention layer abstract class.
@@ -580,19 +536,13 @@ class ParallelAttention(MegatronModule):
         else:
             kv_projection_size = args.kv_channels * args.num_attention_heads

-        self.use_flash_attn = (args.use_flash_attn_ck or args.use_flash_attn_triton) \
+        self.use_flash_attn = args.use_flash_attn \
             and attention_type == AttnType.self_attn \
             and self.attn_mask_type == AttnMaskType.causal
-        self.use_flash_attn_triton = args.use_flash_attn_triton
         if self.use_flash_attn:
-            if args.use_flash_attn_ck:
-                if flash_attn_unpadded_func is None:
-                    raise ImportError('FlashAttention is not installed, please install with '
-                                      'pip install flash-attn')
-            if args.use_flash_attn_triton:
-                assert flash_attn_func != None, "Cannot import FlashAttention triton "
-
+            if flash_attn_unpadded_func is None:
+                raise ImportError('FlashAttention is not installed, please install with '
+                                  'pip install flash-attn')
             assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
                                                           'self-attention for now')
             assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
@@ -652,10 +602,7 @@ class ParallelAttention(MegatronModule):
                 self.attn_mask_type)
             self.checkpoint_core_attention = config.recompute_granularity == 'selective'

-        # Currently FlashAttention only works with causal mask
-        if self.use_flash_attn_triton:
-            self.core_attention_flash = FlashSelfAttentionTriton(causal=True, attention_dropout=args.attention_dropout)
-        elif self.use_flash_attn:
+        if self.use_flash_attn:
             self.core_attention_flash = FlashSelfAttention(
                 causal=True, attention_dropout=config.attention_dropout
             )
@@ -763,7 +710,7 @@
                                                              dim=3)

             # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-            query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+            query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
         else:
             # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
             mixed_kv_layer, _ = self.key_value(encoder_output)
@@ -868,17 +815,14 @@
             context_layer = self.core_attention(
                 query_layer, key_layer, value_layer, attention_mask)
         else:
-            if not self.use_flash_attn_triton:
-                query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous()
-                #q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
-                        for x in (query_layer, key_layer, value_layer)]
+            q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
+                       for x in (query_layer, key_layer, value_layer)]
             if not self.sequence_parallel:
                 with tensor_parallel.get_cuda_rng_tracker().fork():
-                    context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
+                    context_layer = self.core_attention_flash(q, k, v)
             else:
-                context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
-            if not self.use_flash_attn_triton:
-                context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+                context_layer = self.core_attention_flash(q, k, v)
+            context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()

         # =================
         # Output. [sq, b, h]
@@ -1229,9 +1173,7 @@ class ParallelTransformerLayer(MegatronModule):
         # hidden_states: [s, b, h]

         # Layer norm at the beginning of the transformer layer.
-        from unsloth.kernels.rms_layernorm import fast_rms_layernorm
-        norm_output = self.input_norm(hidden_states) if not args.use_fast_rms_layernorm else fast_rms_layernorm(self.input_norm, hidden_states)
-        #norm_output = self.input_norm(hidden_states)
+        norm_output = self.input_norm(hidden_states)

         # Self attention.
         attention_output, attention_bias = \
......
@@ -60,7 +60,6 @@ def openai_gelu(x):
 def erf_gelu(x):
     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))

-@torch.compile(mode="max-autotune-no-cudagraphs")
 def get_norm(config):
     args = get_args()
     if args.normalization == "LayerNorm":
......
@@ -6,14 +6,9 @@ export OMP_NUM_THREADS=1
 export NCCL_P2P_LEVEL=SYS
 export NCCL_ALGO=Ring
-export NCCL_NCHANNELS_PER_PEER=16
-export NCCL_MIN_NCHANNELS=20
 export NCCL_IB_TIMEOUT=22
 export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_NET_GDR_LEVEL=SYS
-export NCCL_NET_GDR_READ=0

 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
 RANK=$OMPI_COMM_WORLD_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -47,7 +42,7 @@ TRAINING_ARGS=(
     --clip-grad 1.0
     --bf16
     --use-distributed-optimizer
-    --use-flash-attn-triton
+    --use-flash-attn
    --disable-bias-linear
     --attention-dropout 0
     --hidden-dropout 0
@@ -57,8 +52,6 @@ TRAINING_ARGS=(
     --lr-decay-style cosine
     --min-lr 3.0e-6
     --lr-warmup-iters 1
-    --use-fast-rms-layernorm
-    --use-fast-cross-entropy-loss
 )
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
@@ -99,7 +92,7 @@ APP="python3 -u pretrain_gpt.py \
     --dist_url tcp://${1}:34566 \
     "

-#for hygon cpu
+#for intel cpu
 case ${lrank} in
 [0])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
@@ -107,31 +100,31 @@ case ${lrank} in
     ;;
 [1])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=1 --membind=1 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [2])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=2 --membind=2 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [3])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=3 --membind=3 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [4])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=4 --membind=4 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [5])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=5 --membind=5 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [6])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=6 --membind=6 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [7])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=7 --membind=7 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 esac
......
unsloth @ a2f8db3e
Subproject commit a2f8db3e7341f983af5814a2c56f54fa29ee548d