Commit 1e2a2c68 authored by unknown

BW adaptation

parent 9c04fee1
Pipeline #1991 passed with stage
@@ -14,32 +14,32 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NCCL_NET_GDR_LEVEL=SYS
 export NCCL_NET_GDR_READ=0
-lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CHECKPOINT_PATH=./tmp #$1 #<Specify path>
 TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
-DATA_PATH="./dataset/my-gpt2_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/datasets/oscar-1GB-gpt_text_document" #<Specify path and file prefix>_text_document
 VOCAB_PATH=./gpt2-vocab.json
 MERGE_PATH=./gpt2-merges.txt
 GPT_MODEL_ARGS=(
-    --num-layers 32
-    --hidden-size 4096
-    --num-attention-heads 32
-    --ffn-hidden-size 11008
-    --seq-length 4096
-    --max-position-embeddings 4096
+    --num-layers 12
+    --hidden-size 768
+    --num-attention-heads 12
+    --ffn-hidden-size 3072
+    --seq-length 1024
+    --max-position-embeddings 1024
 )
+# export NVTE_FLASH_ATTN=1 # use the autlass path
+# export NVTE_FLASH_ATTN_TRITON=1 # use the triton_fa path
+# --transformer-impl transformer_engine
+# --use-mcore-models
 TRAINING_ARGS=(
-    --log-throughput
     --transformer-impl local
     --use-legacy-models
     --micro-batch-size 1
-    --global-batch-size 240
-    --train-iters 5
+    --global-batch-size 60 #240 #512 #64
+    --train-iters 100
     --weight-decay 0.1
     --adam-beta1 0.9
     --adam-beta2 0.95
@@ -47,23 +47,26 @@ TRAINING_ARGS=(
     --clip-grad 1.0
     --bf16
     --use-distributed-optimizer
-    --use-flash-attn-triton
+    --ckpt-format torch
     --disable-bias-linear
+    --overlap-grad-reduce
     --attention-dropout 0
     --hidden-dropout 0
+    --ddp-average-in-collective
+    --recompute-granularity full
+    --recompute-num-layers 5
+    --recompute-method block
     --no-gradient-accumulation-fusion
     --swiglu
     --lr 3.0e-5
     --lr-decay-style cosine
     --min-lr 3.0e-6
     --lr-warmup-iters 1
-    --use-fast-rms-layernorm
-    --use-fast-cross-entropy-loss
 )
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
-    --tensor-model-parallel-size 1
-    --pipeline-model-parallel-size 4
+    --tensor-model-parallel-size 2
+    --pipeline-model-parallel-size 1
 )
 DATA_ARGS=(
@@ -88,69 +91,35 @@ EVAL_AND_LOGGING_ARGS=(
     --tensorboard-dir $TENSORBOARD_LOGS_PATH
 )
APP="python3 -u pretrain_gpt.py \ NNODES=1
${GPT_MODEL_ARGS[@]} \ NODE_RANK=0
${TRAINING_ARGS[@]} \ MASTER_ADDR=localhost
${MODEL_PARALLEL_ARGS[@]} \ while [ $# -gt 0 ]
${DATA_ARGS[@]} \ do
${EVAL_AND_LOGGING_ARGS[@]} case $1 in
--rank ${RANK} \ --NNODES)
--world_size ${WORLD_SIZE} \ NNODES=$2; shift;;
--dist_url tcp://${1}:34566 \ --NODE_RANK)
" NODE_RANK=$2; shift;;
--MASTER_ADDR)
#for hygon cpu MASTER_ADDR=$2; shift;;
case ${lrank} in (*)
[0]) break;;
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac esac
shift
done
DISTRIBUTED_ARGS=(
--nproc_per_node 2
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port 29500
)
export HIP_VISIBLE_DEVICES=2,3 #0,1,2,3,4,5,6,7
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
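For reference, the launch pattern introduced above reduces to the following minimal sketch: parse `--NNODES/--NODE_RANK/--MASTER_ADDR`, then hand everything to `torchrun`. The 2 processes per node and port number are taken from the script and are only illustrative.

```shell
#!/bin/bash
# Minimal sketch of the new launcher: a --FLAG value loop plus torchrun.
NNODES=1; NODE_RANK=0; MASTER_ADDR=localhost
while [ $# -gt 0 ]; do
    case $1 in
        --NNODES)      NNODES=$2; shift;;
        --NODE_RANK)   NODE_RANK=$2; shift;;
        --MASTER_ADDR) MASTER_ADDR=$2; shift;;
        (*)            break;;
    esac
    shift
done
torchrun --nproc_per_node 2 --nnodes $NNODES --node_rank $NODE_RANK \
         --master_addr $MASTER_ADDR --master_port 29500 pretrain_gpt.py "$@"
```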
@@ -39,70 +39,130 @@ pip install apex* (the downloaded apex whl package)
 </pre>
 If pip install is too slow, you can add a mirror source: -i https://pypi.tuna.tsinghua.edu.cn/simple/
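For example, the mirror flag can be appended to any pip command in this section (the package name below is only a placeholder):

```shell
pip install einops -i https://pypi.tuna.tsinghua.edu.cn/simple/
```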
-3. Install unsloth
-<pre>
-git clone https://github.com/unslothai/unsloth.git
-cd ./unsloth
-pip3 install -e .
-</pre>
-# Download the vocabulary files
+# Pretraining
+## gpt
+### Download the vocabulary files
 <pre>
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
 </pre>
-# Download the training data
+### Download the training data
 Use the 1GB, 79K-record jsonl dataset
 <pre>
 wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
 xz -d oscar-1GB.jsonl.xz
 </pre>
+After decompression you get a single `oscar-1GB.jsonl` file
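As a quick sanity check that the download and decompression worked (the numbers are approximate and taken from the description above):

```shell
ls -lh oscar-1GB.jsonl   # roughly 1GB on disk
wc -l oscar-1GB.jsonl    # roughly 79K JSON lines, one document per line
```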
-# Training
-## Data preprocessing
-<pre>
+### Data preprocessing
+```shell
 python tools/preprocess_data.py \
 --input oscar-1GB.jsonl \
---output-prefix ./dataset/my-gpt2 \
+--output-prefix ./dataset/oscar-1GB-gpt \
 --vocab-file gpt2-vocab.json \
 --tokenizer-type GPT2BPETokenizer \
 --merge-file gpt2-merges.txt \
 --append-eod \
 --workers 8
-</pre>
-Parameter description
---input: path to the input dataset, i.e. the file produced by decompressing oscar-1GB.jsonl.xz
---output-prefix: output data path; the suffix _text_document is appended automatically after processing
---vocab-file: path to the downloaded gpt2-vocab.json vocabulary file
---tokenizer-type: type of tokenizer
---merge-file: path to the downloaded gpt2-merges.txt file
---append-eod: append an end-of-document marker
---workers: number of worker processes
-## GPT pretraining
+# Parameter description
+# --input: path to the input dataset, i.e. the file produced by decompressing oscar-1GB.jsonl.xz
+# --output-prefix: output data path; the suffix _text_document is appended automatically after processing
+# --vocab-file: path to the downloaded gpt2-vocab.json vocabulary file
+# --tokenizer-type: type of tokenizer
+# --merge-file: path to the downloaded gpt2-merges.txt file
+# --append-eod: append an end-of-document marker
+# --workers: number of worker processes
+```
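If preprocessing succeeds, the output directory should contain an indexed dataset pair named after `--output-prefix`; `DATA_PATH` later points at this prefix without the extension:

```shell
ls ./dataset/
# oscar-1GB-gpt_text_document.bin   # token ids of all documents
# oscar-1GB-gpt_text_document.idx   # index into the .bin file
```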
+### GPT pretraining
+Script: `GPT_pretrain.sh`
+Modify the dataset and vocabulary file paths
+```shell
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+DATA_PATH="./dataset/oscar-1GB-gpt_text_document"
+```
+- Single-node multi-GPU training
+```shell
+# Modify the distributed launch parameters in the script:
+# nproc_per_node is the number of GPUs on a single node
+# nnodes is the number of nodes
+# node_rank is the index of the current node
+# master_addr is the master node address
+# master_port is the communication port
+bash GPT_pretraining.sh >& GPT_pretraining.log
+```
+View the training log in `GPT_pretraining.log`
-### Distributed training
-- Modify the DATA_PATH path
-```bash
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-DATA_PATH="./dataset/my-gpt2_text_document"
+- Multi-node multi-GPU training
+Assume there are two nodes, 192.168.1.1 and 192.168.1.2
+```shell
+# Run the following command on node 192.168.1.1:
+bash GPT_pretraining.sh --NNODES 2 --NODE_RANK 0 --MASTER_ADDR 192.168.1.1 >& GPT_pretraining_rank0.log
+# Run the following command on node 192.168.1.2:
+bash GPT_pretraining.sh --NNODES 2 --NODE_RANK 1 --MASTER_ADDR 192.168.1.1 >& GPT_pretraining_rank1.log
 ```
+View the training logs in `GPT_pretraining_rank0.log` and `GPT_pretraining_rank1.log`
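To follow training progress without reading the whole log, the per-iteration lines can be filtered; the pattern below matches the usual Megatron-LM iteration log line and works on any of the logs above:

```shell
grep "elapsed time per iteration" GPT_pretraining_rank0.log | tail -n 5
```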
-- Run multi-GPU training
+## llama
+### Download the tokenizer files
+Link: https://www.modelscope.cn/models/shakechen/Llama-2-7b-hf/files
+Download the tokenizer* files from that page
+### Download the training data
+Use the 1GB, 79K-record jsonl dataset
+<pre>
+wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
+xz -d oscar-1GB.jsonl.xz
+</pre>
+After decompression you get a single `oscar-1GB.jsonl` file
+### Data preprocessing
+```shell
+python tools/preprocess_data.py \
+--input oscar-1GB.jsonl \
+--output-prefix /datasets/oscar-1GB-llama \
+--tokenizer-type Llama2Tokenizer \
+--tokenizer-model /path/to/llama2_7b_hf/tokenizer.model \
+--workers 16 \
+--append-eod
+```
+### llama pretraining
+Script: `llama_pretrain.sh`
+Modify the dataset and tokenizer paths
+```shell
+DATA_PATH="/datasets/oscar-1GB-llama_text_document"
+--tokenizer-model /path/to/llama2_7b_hf/tokenizer.model
+```
+- Single-node multi-GPU training
+```shell
+bash llama_pretraining.sh >& llama_pretraining.log
 ```
-# np is the number of processes to launch; np and hostfile must both be set according to the actual environment
-mpirun -np 4 --hostfile hostfile single.sh localhost (single node, four GPUs)
+View the training log in `llama_pretraining.log`
+- Multi-node multi-GPU training
+Assume there are two nodes, 192.168.1.1 and 192.168.1.2
+```shell
+# Run the following command on node 192.168.1.1:
+bash llama_pretraining.sh --NNODES 2 --NODE_RANK 0 --MASTER_ADDR 192.168.1.1 >& llama_pretraining_rank0.log
+# Run the following command on node 192.168.1.2:
+bash llama_pretraining.sh --NNODES 2 --NODE_RANK 1 --MASTER_ADDR 192.168.1.1 >& llama_pretraining_rank1.log
 ```
+View the training logs in `llama_pretraining_rank0.log` and `llama_pretraining_rank1.log`
 # References
......
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
source /opt/dtk/env.sh
# te's gemm calls require the hipblaslt library to be loaded
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/datasets/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--ffn-hidden-size 2048
--num-attention-heads 16
--seq-length 4096 #4096
--max-position-embeddings 32768
)
# export NVTE_FLASH_ATTN=1 # use the autlass path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton_fa path
# --transformer-impl transformer_engine
# --use-mcore-models
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 60 #240 #512 #64
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--ckpt-format torch
--disable-bias-linear
--overlap-grad-reduce
--attention-dropout 0
--hidden-dropout 0
--ddp-average-in-collective
--recompute-granularity full
--recompute-num-layers 5
--recompute-method block
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 1
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type Llama2Tokenizer
--tokenizer-model /path/to/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
DISTRIBUTED_ARGS=(
--nproc_per_node 4
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 29500
)
export HIP_VISIBLE_DEVICES=0,1,2,3 #4,5,6,7
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
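As a sanity check on the configuration above: Megatron derives the number of gradient-accumulation steps from the global batch size, the micro batch size, and the data-parallel size, so with 4 processes and tensor-parallel size 4 this script runs with data-parallel size 1 and 60 accumulation steps per iteration. A quick sketch of the arithmetic (values copied from the script):

```shell
NPROC=4; TP=4; PP=1                # from DISTRIBUTED_ARGS / MODEL_PARALLEL_ARGS
MICRO_BATCH=1; GLOBAL_BATCH=60     # from TRAINING_ARGS
DP=$((NPROC / (TP * PP)))                      # data-parallel size -> 1
ACC=$((GLOBAL_BATCH / (MICRO_BATCH * DP)))     # gradient-accumulation steps -> 60
echo "data-parallel size: $DP, accumulation steps: $ACC"
```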
@@ -13,36 +13,10 @@ from .language_model import parallel_lm_logits
 from .language_model import get_language_model
-#def post_language_model_processing(lm_output, labels, logit_weights,
-#                                   parallel_output,
-#                                   fp16_lm_cross_entropy):
-#
-#    # Output. Format [s b h]
-#    output = parallel_lm_logits(
-#        lm_output,
-#        logit_weights,
-#        parallel_output)
-#
-#    if labels is None:
-#        # [s b h] => [b s h]
-#        return output.transpose(0,1).contiguous()
-#    else:
-#        # [b s] => [s b]
-#        labels = labels.transpose(0,1).contiguous()
-#        if fp16_lm_cross_entropy:
-#            assert output.dtype == torch.half
-#            loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels)
-#        else:
-#            loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels)
-#
-#        # [s b] => [b, s]
-#        loss = loss.transpose(0,1).contiguous()
-#        return loss
 def post_language_model_processing(lm_output, labels, logit_weights,
                                    parallel_output,
                                    fp16_lm_cross_entropy):
-    args = get_args()
     # Output. Format [s b h]
     output = parallel_lm_logits(
         lm_output,
@@ -53,37 +27,19 @@ def post_language_model_processing(lm_output, labels, logit_weights,
         # [s b h] => [b s h]
         return output.transpose(0,1).contiguous()
     else:
-        if not args.use_fast_cross_entropy_loss:
-            # [b s] => [s b]
-            labels = labels.transpose(0,1).contiguous()
-            if fp16_lm_cross_entropy:
-                assert output.dtype == torch.half
-                loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels)
-            else:
-                loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels)
-            # [s b] => [b, s]
-            loss = loss.transpose(0,1).contiguous()
-        else:
-            import os
-            from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss
-            # [s b h] => [b s h]
-            output = output.transpose(0,1).contiguous()
-            logit_softcapping = int(os.getenv("final_logit_softcapping", "0"))
-            loss = fast_cross_entropy_loss(
-                logits = output,
-                labels = labels,
-                logit_softcapping = logit_softcapping,
-            )
+        # [b s] => [s b]
+        labels = labels.transpose(0,1).contiguous()
+        if fp16_lm_cross_entropy:
+            assert output.dtype == torch.half
+            loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels)
+        else:
+            loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels)
+        # [s b] => [b, s]
+        loss = loss.transpose(0,1).contiguous()
         return loss
 class GPTModel(MegatronModule):
     """GPT-2 Language model."""
......
@@ -2,9 +2,6 @@
 import torch
 from torch import nn
-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
 class RMSNorm(torch.nn.Module):
@@ -26,11 +23,9 @@ class RMSNorm(torch.nn.Module):
         setattr(self.weight, 'sequence_parallel', sequence_parallel)
-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def forward(self, x):
         output = self._norm(x.float()).type_as(x)
         return output * self.weight
@@ -40,9 +40,6 @@ from megatron.legacy.model.utils import (
 )
 from megatron.training import get_args, get_timers
-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
 from .module import MegatronModule
 try:
@@ -59,10 +56,6 @@ except ImportError:
     )
 except ImportError:
     flash_attn_unpadded_func = None
-try:
-    from flash_attn.flash_attn_triton import flash_attn_func
-except ImportError:
-    flash_attn_func = None
 """ We use the following notation throughout this file:
     h: hidden size
@@ -139,8 +132,8 @@ class ParallelMLP(MegatronModule):
             self.activation_func = openai_gelu
         elif args.onnx_safe:
             self.activation_func = erf_gelu
-        elif args.swiglu:
-            @torch.compile(mode="max-autotune-no-cudagraphs") # compile optimization point
+        elif args.swiglu: # note: the *2 here needs more GPU memory
             def swiglu(x):
                 x = torch.chunk(x, 2, dim=-1)
                 return F.silu(x[0]) * x[1]
@@ -164,7 +157,7 @@ class ParallelMLP(MegatronModule):
             input_is_parallel=True,
             is_expert=is_expert,
         )
-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def forward(self, hidden_states):
         # [s, b, 4hp]
@@ -475,10 +468,6 @@ class FlashSelfAttention(torch.nn.Module):
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout
-        # Use FlashAttention-2 when args.use_flash_attn_ck is True
-        args = get_args()
-        self.flash_attn_func = flash_attn_unpadded_func
     def forward(self, q, k, v):
         """Implements the multihead softmax attention.
         Arguments
@@ -520,38 +509,6 @@ class FlashSelfAttention(torch.nn.Module):
         output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
         return output
-class FlashSelfAttentionTriton(torch.nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                      (default: 1/sqrt(d_keys) where d_keys is computed at
-                      runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0,
-                 device=None, dtype=None):
-        super().__init__()
-        assert flash_attn_func is not None, ('Triton version of FlashAttention is not installed.')
-        assert rearrange is not None, 'Please install einops first, e.g., with pip install einops'
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-    def forward(self, q, k, v):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
-        """
-        assert q.dtype in [torch.float16, torch.bfloat16]
-        assert q.is_cuda
-        q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous()
-                   for x in (q, k, v)]
-        output = flash_attn_func(q, k, v, self.causal)
-        output = rearrange(output, 'b s h d -> h b (s d)').contiguous()
-        return output
 class ParallelAttention(MegatronModule):
     """Parallel self-attention layer abstract class.
@@ -580,19 +537,13 @@ class ParallelAttention(MegatronModule):
         else:
             kv_projection_size = args.kv_channels * args.num_attention_heads
-        self.use_flash_attn = (args.use_flash_attn_ck or args.use_flash_attn_triton) \
+        self.use_flash_attn = args.use_flash_attn \
             and attention_type == AttnType.self_attn \
             and self.attn_mask_type == AttnMaskType.causal
-        self.use_flash_attn_triton = args.use_flash_attn_triton
         if self.use_flash_attn:
-            if args.use_flash_attn_ck:
-                if flash_attn_unpadded_func is None:
-                    raise ImportError('FlashAttention is not installed, please install with '
-                                      'pip install flash-attn')
-            if args.use_flash_attn_triton:
-                assert flash_attn_func != None, "Cannot import FlashAttention triton "
+            if flash_attn_unpadded_func is None:
+                raise ImportError('FlashAttention is not installed, please install with '
+                                  'pip install flash-attn')
             assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
                                                           'self-attention for now')
             assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
@@ -652,10 +603,7 @@ class ParallelAttention(MegatronModule):
                 self.attn_mask_type)
             self.checkpoint_core_attention = config.recompute_granularity == 'selective'
-        # Currently FlashAttention only works with causal mask
-        if self.use_flash_attn_triton:
-            self.core_attention_flash = FlashSelfAttentionTriton(causal=True, attention_dropout=args.attention_dropout)
-        elif self.use_flash_attn:
+        if self.use_flash_attn:
             self.core_attention_flash = FlashSelfAttention(
                 causal=True, attention_dropout=config.attention_dropout
             )
@@ -763,7 +711,7 @@ class ParallelAttention(MegatronModule):
                     dim=3)
                 # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-                query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+                query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
             else:
                 # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
                 mixed_kv_layer, _ = self.key_value(encoder_output)
@@ -868,17 +816,14 @@ class ParallelAttention(MegatronModule):
                 context_layer = self.core_attention(
                     query_layer, key_layer, value_layer, attention_mask)
             else:
-                if not self.use_flash_attn_triton:
-                    query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous()
-                    #q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
-                        for x in (query_layer, key_layer, value_layer)]
+                q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
+                           for x in (query_layer, key_layer, value_layer)]
                 if not self.sequence_parallel:
                     with tensor_parallel.get_cuda_rng_tracker().fork():
-                        context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
+                        context_layer = self.core_attention_flash(q, k, v)
                 else:
-                    context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
-                if not self.use_flash_attn_triton:
-                    context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+                    context_layer = self.core_attention_flash(q, k, v)
+                context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
         # =================
         # Output. [sq, b, h]
@@ -1229,8 +1174,6 @@ class ParallelTransformerLayer(MegatronModule):
         # hidden_states: [s, b, h]
         # Layer norm at the beginning of the transformer layer.
-        # from unsloth.kernels.rms_layernorm import fast_rms_layernorm
-        # norm_output = self.input_norm(hidden_states) if not args.use_fast_rms_layernorm else fast_rms_layernorm(self.input_norm, hidden_states)
         norm_output = self.input_norm(hidden_states)
         # Self attention.
......
@@ -9,8 +9,6 @@ import torch
 from megatron.training import get_args
 from megatron.legacy.model import LayerNorm, RMSNorm
 from megatron.core.jit import jit_fuser
-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
 def init_method_normal(sigma):
     """Init method based on N(0, sigma)."""
@@ -60,7 +58,7 @@ def openai_gelu(x):
 def erf_gelu(x):
     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
-@torch.compile(mode="max-autotune-no-cudagraphs")
 def get_norm(config):
     args = get_args()
     if args.normalization == "LayerNorm":
......
@@ -51,7 +51,6 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_one_logger_args(parser)
     parser = _add_ft_package_args(parser)
     parser = _add_config_logger_args(parser)
-    parser = _add_unsloth_args(parser)
     # Custom arguments.
     if extra_args_provider is not None:
@@ -72,8 +71,8 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     # Args from environment
-    #args.rank = int(os.getenv('RANK', '0'))
-    #args.world_size = int(os.getenv("WORLD_SIZE", '1'))
+    args.rank = int(os.getenv('RANK', '0'))
+    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
     return args
@@ -538,8 +537,6 @@ def validate_args(args, defaults={}):
     if args.decoupled_lr is not None or args.decoupled_min_lr is not None:
         assert not args.use_legacy_models, \
             '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'
-    # FlashAttention
-    args.use_flash_attn = args.use_flash_attn_ck or args.use_flash_attn_triton
     # Legacy RoPE arguments
     if args.use_rotary_position_embeddings:
@@ -573,7 +570,7 @@ def validate_args(args, defaults={}):
         "Expert parallelism is not supported with fp16 training."
     # Distributed checkpointing checks
-    args.use_dist_ckpt = False
+    # print(f"args.use_dist_ckpt: {args.use_dist_ckpt}")
     if args.use_dist_ckpt and args.use_legacy_models:
         raise RuntimeError('--use-dist-ckpt is not supported in legacy models.')
@@ -1220,11 +1217,9 @@ def _add_training_args(parser):
     group.add_argument('--cross-entropy-loss-fusion', action='store_true',
                        help='Enabled fusion of cross entropy loss calculation.',
                        dest='cross_entropy_loss_fusion')
-    group.add_argument('--use-flash-attn-ck', action='store_true',
+    group.add_argument('--use-flash-attn', action='store_true',
                        help='use FlashAttention implementation of attention. '
                        'https://arxiv.org/abs/2205.14135')
-    group.add_argument('--use-flash-attn-triton', action='store_true',
-                       help='use FlashAttention implementation of attention using Triton.')
     group.add_argument('--disable-bias-linear', action='store_false',
                        help='Disable bias in the linear layers',
                        dest='add_bias_linear')
@@ -1574,9 +1569,7 @@ def _add_distributed_args(parser):
                        default=False, help='If set, use custom-built ring exchange '
                        'for p2p communications. Note that this option will require '
                        'a custom built image that support ring-exchange p2p.')
-    # group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
-    #                    help='local rank passed from distributed launcher.')
-    group.add_argument('--local_rank', type=int, default=None,
+    group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
                        help='If set to True, initialize_megatron() '
@@ -1602,12 +1595,6 @@ def _add_distributed_args(parser):
                        help='If set, distributed ranks initialize order is changed '
                        'from tp-dp-pp to tp-pp-dp. Make sure EP and CP aren\'t used '
                        'with this option enabled')
-    group.add_argument('--rank', default=-1, type=int,
-                       help='node rank for distributed training')
-    group.add_argument('--world_size', type=int, default=8,
-                       help='number of nodes for distributed training')
-    group.add_argument('--dist_url',
-                       help='Which master node url for distributed training.')
     return parser
@@ -1703,7 +1690,6 @@ def _add_data_args(parser):
                                 'GPTSentencePieceTokenizer',
                                 'HuggingFaceTokenizer',
                                 'Llama2Tokenizer',
-                                'QwenTokenizer',
                                 'TikTokenizer',
                                 'NullTokenizer'],
                        help='What type of tokenizer to use.')
@@ -1942,13 +1928,3 @@ def _add_experimental_args(parser):
     group.add_argument('--yaml-cfg', type=str, default=None,
                        help = 'Config file to add additional arguments')
     return parser
-def _add_unsloth_args(parser):
-    group = parser.add_argument_group(title='unsloth')
-    group.add_argument('--use-fast-cross-entropy-loss', action='store_true',
-                       help='Use fast_cross_entropy_loss of unsloth more faster in calculating loss')
-    group.add_argument('--use-fast-rms-layernorm', action='store_true',
-                       help='Use fast_rms_layernorm of unsloth more faster in Layer Normalization')
-    return parser
@@ -20,9 +20,9 @@ from megatron.core import mpu, tensor_parallel, dist_checkpointing
 from megatron.core.dist_checkpointing.mapping import ShardedObject
 from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy
 from megatron.core.dist_checkpointing.state_dict_transformation import (
     prepare_state_dict_for_save,
     recreate_state_dict_after_load,
 )
 from megatron.core.dist_checkpointing.strategies.fully_parallel import \
     FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper
 from megatron.core.num_microbatches_calculator import update_num_microbatches
......
@@ -170,11 +170,11 @@ def _compile_dependencies():
     if torch.distributed.get_rank() == 0:
         start_time = time.time()
         print("> compiling and loading fused kernels ...", flush=True)
-        #fused_kernels.load(args)
+        # fused_kernels.load(args)
         torch.distributed.barrier()
     else:
         torch.distributed.barrier()
-        #fused_kernels.load(args)
+        # fused_kernels.load(args)
     # Simple barrier to make sure all ranks have passed the
     # compilation phase successfully before moving on to the
     # rest of the program. We think this might ensure that
@@ -240,35 +240,20 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
         print("> initializing torch distributed ...", flush=True)
     # Manually set the device ids.
     if device_count > 0:
-        #torch.cuda.set_device(args.local_rank)
-        #device_id = torch.device(f'cuda:{args.local_rank}')
-        device_id = args.rank % device_count
-        if args.local_rank is not None:
-            assert (
-                args.local_rank == device_id
-            ), "expected local-rank to be the same as rank % device-count."
-        else:
-            args.local_rank = device_id
-        torch.cuda.set_device(device_id)
+        torch.cuda.set_device(args.local_rank)
+        device_id = torch.device(f'cuda:{args.local_rank}')
     else:
         device_id = None
     # Call the init process
-    torch.distributed.init_process_group(
-        backend=args.distributed_backend,
-        world_size=args.world_size,
-        rank=args.rank,
-        init_method=args.dist_url,
-        timeout=timedelta(minutes=args.distributed_timeout_minutes),
-    )
-    #init_process_group_kwargs = {
-    #    'backend' : args.distributed_backend,
-    #    'world_size': args.world_size,
-    #    'rank': args.rank,
-    #    'timeout': timedelta(minutes=args.distributed_timeout_minutes),
-    #}
-    #torch.distributed.init_process_group(**init_process_group_kwargs)
+    init_process_group_kwargs = {
+        'backend' : args.distributed_backend,
+        'world_size': args.world_size,
+        'rank': args.rank,
+        'timeout': timedelta(minutes=args.distributed_timeout_minutes),
+    }
+    torch.distributed.init_process_group(**init_process_group_kwargs)
     # Set the tensor model-parallel, pipeline model-parallel, and
     # data-parallel communicators.
@@ -349,7 +334,7 @@ def set_jit_fusion_options():
         torch._C._jit_override_can_fuse_on_cpu(False)
         torch._C._jit_override_can_fuse_on_gpu(False)
         torch._C._jit_set_texpr_fuser_enabled(False)
-        torch._C._jit_set_nvfuser_enabled(False) #True
+        torch._C._jit_set_nvfuser_enabled(True)
         torch._C._debug_set_autodiff_subgraph_inlining(False)
     else:
         # legacy pytorch fuser
......
@@ -15,7 +15,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
-from transformers import Qwen2Tokenizer
 def build_tokenizer(args, **kwargs):
     """Initialize tokenizer."""
@@ -49,8 +49,6 @@ def build_tokenizer(args, **kwargs):
     elif args.tokenizer_type == 'Llama2Tokenizer':
         assert args.tokenizer_model is not None
         tokenizer = _Llama2Tokenizer(args.tokenizer_model)
-    elif args.tokenizer_type == 'QwenTokenizer':
-        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == 'TikTokenizer':
         assert args.tokenizer_model is not None
         assert args.tiktoken_pattern is not None
@@ -134,43 +132,6 @@ class _HuggingFaceTokenizer(MegatronTokenizer):
     def eod(self):
         return self._tokenizer.eos_token_id
-class _Qwen2Tokenizer(MegatronTokenizer):
-    def __init__(self, vocab_file, merge_file, extra_vocab_size=0):
-        super().__init__(vocab_file, merge_file)
-        self.tokenizer = Qwen2Tokenizer(vocab_file, merge_file)
-        self.extra_vocab_size = extra_vocab_size
-        self.tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>"))
-    @property
-    def vocab_size(self):
-        return len(self.tokenizer.encoder) + self.extra_vocab_size
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-    @property
-    def eod(self):
-        return self.tokenizer.eos_token_id
-    @property
-    def eos_token(self):
-        return self.tokenizer.eos_token
-    @property
-    def pad_token_id(self):
-        return self.tokenizer.pad_token_id
 class _BertWordPieceTokenizer(MegatronTokenizer):
     """Original BERT wordpiece tokenizer."""
......
@@ -1090,7 +1090,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler,
     # Recover timing
     timers('interval-time', log_level=0).start(barrier=True)
-# @torch.compile(mode="max-autotune-no-cudagraphs") #
 def train(forward_step_func, model, optimizer, opt_param_scheduler,
           train_data_iterator, valid_data_iterator,
           process_non_loss_data_func, config, checkpointing_context):
......
@@ -5,3 +5,13 @@ six
 regex
 pyyaml
 sentencepiece
+# ==== test ====
+nltk
+pytest
+requests
+wrapt
+tensorboard
+tensorboardX
+scipy
+psutil
\ No newline at end of file
@@ -68,3 +68,7 @@ def test_local_multi_tensor_apply():
         False,
     )
     torch.testing.assert_close(norm_apex, norm_local)
+if __name__ == '__main__':
+    test_local_multi_tensor_l2_norm_and_scale()
+    test_local_multi_tensor_apply()
\ No newline at end of file
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-8.3.3, pluggy-1.5.0
rootdir: /workspace
configfile: pytest.ini
plugins: mock-3.14.0
collected 562 items / 3 errors
==================================== ERRORS ====================================
_______ ERROR collecting tests/unit_tests/data/test_preprocess_mmdata.py _______
tests/unit_tests/data/test_preprocess_mmdata.py:14: in <module>
from tools.preprocess_mmdata import Encoder
tools/preprocess_mmdata.py:12: in <module>
from torchvision.transforms import ToTensor
/usr/local/lib/python3.10/site-packages/torchvision/__init__.py:10: in <module>
from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils # usort:skip
/usr/local/lib/python3.10/site-packages/torchvision/_meta_registrations.py:164: in <module>
def meta_nms(dets, scores, iou_threshold):
/usr/local/lib/python3.10/site-packages/torch/library.py:654: in register
use_lib._register_fake(op_name, func, _stacklevel=stacklevel + 1)
/usr/local/lib/python3.10/site-packages/torch/library.py:154: in _register_fake
handle = entry.abstract_impl.register(func_to_register, source)
/usr/local/lib/python3.10/site-packages/torch/_library/abstract_impl.py:31: in register
if torch._C._dispatch_has_kernel_for_dispatch_key(self.qualname, "Meta"):
E RuntimeError: operator torchvision::nms does not exist
__ ERROR collecting tests/unit_tests/dist_checkpointing/models/test_mamba.py ___
ImportError while importing test module '/workspace/tests/unit_tests/dist_checkpointing/models/test_mamba.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
megatron/core/ssm/mamba_mixer.py:41: in <module>
from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
E ModuleNotFoundError: No module named 'mamba_ssm'
During handling of the above exception, another exception occurred:
/usr/local/lib/python3.10/importlib/__init__.py:126: in import_module
return _bootstrap._gcd_import(name[level:], package, level)
tests/unit_tests/dist_checkpointing/models/test_mamba.py:17: in <module>
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
megatron/core/ssm/mamba_mixer.py:47: in <module>
raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported")
E ImportError: mamba-ssm is required by the Mamba model but cannot be imported
_________ ERROR collecting tests/unit_tests/models/test_mamba_model.py _________
ImportError while importing test module '/workspace/tests/unit_tests/models/test_mamba_model.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
megatron/core/ssm/mamba_mixer.py:41: in <module>
from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
E ModuleNotFoundError: No module named 'mamba_ssm'
During handling of the above exception, another exception occurred:
/usr/local/lib/python3.10/importlib/__init__.py:126: in import_module
return _bootstrap._gcd_import(name[level:], package, level)
tests/unit_tests/models/test_mamba_model.py:7: in <module>
from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
megatron/core/models/mamba/mamba_layer_specs.py:11: in <module>
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
megatron/core/ssm/mamba_mixer.py:47: in <module>
raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported")
E ImportError: mamba-ssm is required by the Mamba model but cannot be imported
=============================== warnings summary ===============================
megatron/core/tensor_parallel/layers.py:280
/workspace/megatron/core/tensor_parallel/layers.py:280: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
def forward(ctx, input, weight, bias, allreduce_dgrad):
megatron/core/tensor_parallel/layers.py:290
/workspace/megatron/core/tensor_parallel/layers.py:290: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
def backward(ctx, grad_output):
megatron/core/tensor_parallel/layers.py:381
/workspace/megatron/core/tensor_parallel/layers.py:381: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
def forward(
megatron/core/tensor_parallel/layers.py:420
/workspace/megatron/core/tensor_parallel/layers.py:420: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
def backward(ctx, grad_output):
megatron/core/transformer/attention.py:29
/workspace/megatron/core/transformer/attention.py:29: DeprecationWarning: The 'megatron.core.transformer.custom_layers.transformer_engine'
module is deprecated and will be removed in 0.10.0. Please use
'megatron.core.extensions.transformer_engine' instead.
from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim
megatron/core/dist_checkpointing/strategies/torch.py:17
/workspace/megatron/core/dist_checkpointing/strategies/torch.py:17: DeprecationWarning: `torch.distributed._sharded_tensor` will be deprecated, use `torch.distributed._shard.sharded_tensor` instead
from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor
tests/unit_tests/dist_checkpointing/test_async_save.py:74
/workspace/tests/unit_tests/dist_checkpointing/test_async_save.py:74: PytestUnknownMarkWarning: Unknown pytest.mark.flaky_in_dev - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.flaky_in_dev
tests/unit_tests/dist_checkpointing/test_fp8.py:55
/workspace/tests/unit_tests/dist_checkpointing/test_fp8.py:55: PytestUnknownMarkWarning: Unknown pytest.mark.flaky_in_dev - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.flaky_in_dev
tests/unit_tests/test_utilities.py:11
/workspace/tests/unit_tests/test_utilities.py:11: PytestCollectionWarning: cannot collect test class 'TestModel' because it has a __init__ constructor (from: tests/unit_tests/distributed/test_param_and_grad_buffer.py)
class TestModel(torch.nn.Module):
tests/unit_tests/test_utilities.py:11
/workspace/tests/unit_tests/test_utilities.py:11: PytestCollectionWarning: cannot collect test class 'TestModel' because it has a __init__ constructor (from: tests/unit_tests/test_utilities.py)
class TestModel(torch.nn.Module):
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:20
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:20: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:36
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:36: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:52
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:52: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:71
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:71: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
=========================== short test summary info ============================
ERROR tests/unit_tests/data/test_preprocess_mmdata.py - RuntimeError: operato...
ERROR tests/unit_tests/dist_checkpointing/models/test_mamba.py
ERROR tests/unit_tests/models/test_mamba_model.py
!!!!!!!!!!!!!!!!!!! Interrupted: 3 errors during collection !!!!!!!!!!!!!!!!!!!!
======================== 14 warnings, 3 errors in 2.70s ========================
@@ -203,7 +203,7 @@ def get_args():
                        choices=['BertWordPieceLowerCase','BertWordPieceCase',
                                 'GPT2BPETokenizer', 'SentencePieceTokenizer',
                                 'GPTSentencePieceTokenizer', 'Llama2Tokenizer',
-                                'Llama3Tokenizer', 'MistralTokenizer', 'QwenTokenizer', 'NullTokenizer'],
+                                'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--tokenizer-model', type=str, default=None,
                        help='YTTM tokenizer model.')
......