Commit 1e2a2c68 authored by unknown

BW adaptation

parent 9c04fee1
Pipeline #1991 passed with stage
@@ -14,32 +14,32 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NCCL_NET_GDR_LEVEL=SYS
 export NCCL_NET_GDR_READ=0
-lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CHECKPOINT_PATH=./tmp #$1 #<Specify path>
 TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
-DATA_PATH="./dataset/my-gpt2_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/datasets/oscar-1GB-gpt_text_document" #<Specify path and file prefix>_text_document
 VOCAB_PATH=./gpt2-vocab.json
 MERGE_PATH=./gpt2-merges.txt
 GPT_MODEL_ARGS=(
-    --num-layers 32
-    --hidden-size 4096
-    --num-attention-heads 32
-    --ffn-hidden-size 11008
-    --seq-length 4096
-    --max-position-embeddings 4096
+    --num-layers 12
+    --hidden-size 768
+    --num-attention-heads 12
+    --ffn-hidden-size 3072
+    --seq-length 1024
+    --max-position-embeddings 1024
 )
+# export NVTE_FLASH_ATTN=1 # use the autlass path
+# export NVTE_FLASH_ATTN_TRITON=1 # use the triton_fa path
+# --transformer-impl transformer_engine
+# --use-mcore-models
 TRAINING_ARGS=(
-    --log-throughput
     --transformer-impl local
     --use-legacy-models
     --micro-batch-size 1
-    --global-batch-size 240
-    --train-iters 5
+    --global-batch-size 60 #240 #512 #64
+    --train-iters 100
     --weight-decay 0.1
     --adam-beta1 0.9
     --adam-beta2 0.95
@@ -47,23 +47,26 @@ TRAINING_ARGS=(
     --clip-grad 1.0
     --bf16
     --use-distributed-optimizer
-    --use-flash-attn-triton
+    --ckpt-format torch
     --disable-bias-linear
+    --overlap-grad-reduce
     --attention-dropout 0
     --hidden-dropout 0
+    --ddp-average-in-collective
+    --recompute-granularity full
+    --recompute-num-layers 5
+    --recompute-method block
     --no-gradient-accumulation-fusion
     --swiglu
     --lr 3.0e-5
     --lr-decay-style cosine
     --min-lr 3.0e-6
     --lr-warmup-iters 1
-    --use-fast-rms-layernorm
-    --use-fast-cross-entropy-loss
 )
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
-    --tensor-model-parallel-size 1
-    --pipeline-model-parallel-size 4
+    --tensor-model-parallel-size 2
+    --pipeline-model-parallel-size 1
 )
 DATA_ARGS=(
@@ -88,69 +91,35 @@ EVAL_AND_LOGGING_ARGS=(
     --tensorboard-dir $TENSORBOARD_LOGS_PATH
 )
APP="python3 -u pretrain_gpt.py \ NNODES=1
${GPT_MODEL_ARGS[@]} \ NODE_RANK=0
${TRAINING_ARGS[@]} \ MASTER_ADDR=localhost
${MODEL_PARALLEL_ARGS[@]} \ while [ $# -gt 0 ]
${DATA_ARGS[@]} \ do
${EVAL_AND_LOGGING_ARGS[@]} case $1 in
--rank ${RANK} \ --NNODES)
--world_size ${WORLD_SIZE} \ NNODES=$2; shift;;
--dist_url tcp://${1}:34566 \ --NODE_RANK)
" NODE_RANK=$2; shift;;
--MASTER_ADDR)
#for hygon cpu MASTER_ADDR=$2; shift;;
case ${lrank} in (*)
[0]) break;;
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac esac
shift
done
DISTRIBUTED_ARGS=(
--nproc_per_node 2
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port 29500
)
export HIP_VISIBLE_DEVICES=2,3 #0,1,2,3,4,5,6,7
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
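For reference, the launch pattern introduced above reduces to the following minimal sketch: parse `--NNODES/--NODE_RANK/--MASTER_ADDR`, then hand everything to `torchrun`. The 2 processes per node and port number are taken from the script and are only illustrative.

```shell
#!/bin/bash
# Minimal sketch of the new launcher: a --FLAG value loop plus torchrun.
NNODES=1; NODE_RANK=0; MASTER_ADDR=localhost
while [ $# -gt 0 ]; do
    case $1 in
        --NNODES)      NNODES=$2; shift;;
        --NODE_RANK)   NODE_RANK=$2; shift;;
        --MASTER_ADDR) MASTER_ADDR=$2; shift;;
        (*)            break;;
    esac
    shift
done
torchrun --nproc_per_node 2 --nnodes $NNODES --node_rank $NODE_RANK \
         --master_addr $MASTER_ADDR --master_port 29500 pretrain_gpt.py "$@"
```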
@@ -39,70 +39,130 @@ pip install apex* (the downloaded apex whl package)
 </pre>
 If pip install is too slow, you can add a mirror source: -i https://pypi.tuna.tsinghua.edu.cn/simple/
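For example, the mirror flag can be appended to any pip command in this section (the package name below is only a placeholder):

```shell
pip install einops -i https://pypi.tuna.tsinghua.edu.cn/simple/
```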
-3. Install unsloth
-<pre>
-git clone https://github.com/unslothai/unsloth.git
-cd ./unsloth
-pip3 install -e .
-</pre>
-# Download the vocabulary files
+# Pretraining
+## gpt
+### Download the vocabulary files
 <pre>
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
 </pre>
-# Download the training data
+### Download the training data
 Use the 1GB, 79K-record jsonl dataset
 <pre>
 wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
 xz -d oscar-1GB.jsonl.xz
 </pre>
+After decompression you get a single `oscar-1GB.jsonl` file
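As a quick sanity check that the download and decompression worked (the numbers are approximate and taken from the description above):

```shell
ls -lh oscar-1GB.jsonl   # roughly 1GB on disk
wc -l oscar-1GB.jsonl    # roughly 79K JSON lines, one document per line
```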
-# Training
-## Data preprocessing
-<pre>
+### Data preprocessing
+```shell
 python tools/preprocess_data.py \
 --input oscar-1GB.jsonl \
---output-prefix ./dataset/my-gpt2 \
+--output-prefix ./dataset/oscar-1GB-gpt \
 --vocab-file gpt2-vocab.json \
 --tokenizer-type GPT2BPETokenizer \
 --merge-file gpt2-merges.txt \
 --append-eod \
 --workers 8
-</pre>
-Parameter description
---input: path to the input dataset, i.e. the file produced by decompressing oscar-1GB.jsonl.xz
---output-prefix: output data path; the suffix _text_document is appended automatically after processing
---vocab-file: path to the downloaded gpt2-vocab.json vocabulary file
---tokenizer-type: type of tokenizer
---merge-file: path to the downloaded gpt2-merges.txt file
---append-eod: append an end-of-document marker
---workers: number of worker processes
-## GPT pretraining
+# Parameter description
+# --input: path to the input dataset, i.e. the file produced by decompressing oscar-1GB.jsonl.xz
+# --output-prefix: output data path; the suffix _text_document is appended automatically after processing
+# --vocab-file: path to the downloaded gpt2-vocab.json vocabulary file
+# --tokenizer-type: type of tokenizer
+# --merge-file: path to the downloaded gpt2-merges.txt file
+# --append-eod: append an end-of-document marker
+# --workers: number of worker processes
+```
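If preprocessing succeeds, the output directory should contain an indexed dataset pair named after `--output-prefix`; `DATA_PATH` later points at this prefix without the extension:

```shell
ls ./dataset/
# oscar-1GB-gpt_text_document.bin   # token ids of all documents
# oscar-1GB-gpt_text_document.idx   # index into the .bin file
```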
+### GPT pretraining
+Script: `GPT_pretrain.sh`
+Modify the dataset and vocabulary file paths
+```shell
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+DATA_PATH="./dataset/oscar-1GB-gpt_text_document"
+```
+- Single-node multi-GPU training
+```shell
+# Modify the distributed launch parameters in the script:
+# nproc_per_node is the number of GPUs on a single node
+# nnodes is the number of nodes
+# node_rank is the index of the current node
+# master_addr is the master node address
+# master_port is the communication port
+bash GPT_pretraining.sh >& GPT_pretraining.log
+```
+View the training log in `GPT_pretraining.log`
-### Distributed training
-- Modify the DATA_PATH path
-```bash
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-DATA_PATH="./dataset/my-gpt2_text_document"
+- Multi-node multi-GPU training
+Assume there are two nodes, 192.168.1.1 and 192.168.1.2
+```shell
+# Run the following command on node 192.168.1.1:
+bash GPT_pretraining.sh --NNODES 2 --NODE_RANK 0 --MASTER_ADDR 192.168.1.1 >& GPT_pretraining_rank0.log
+# Run the following command on node 192.168.1.2:
+bash GPT_pretraining.sh --NNODES 2 --NODE_RANK 1 --MASTER_ADDR 192.168.1.1 >& GPT_pretraining_rank1.log
 ```
+View the training logs in `GPT_pretraining_rank0.log` and `GPT_pretraining_rank1.log`
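To follow training progress without reading the whole log, the per-iteration lines can be filtered; the pattern below matches the usual Megatron-LM iteration log line and works on any of the logs above:

```shell
grep "elapsed time per iteration" GPT_pretraining_rank0.log | tail -n 5
```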
-- Run multi-GPU training
+## llama
+### Download the tokenizer files
+Link: https://www.modelscope.cn/models/shakechen/Llama-2-7b-hf/files
+Download the tokenizer* files from that page
+### Download the training data
+Use the 1GB, 79K-record jsonl dataset
+<pre>
+wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
+xz -d oscar-1GB.jsonl.xz
+</pre>
+After decompression you get a single `oscar-1GB.jsonl` file
+### Data preprocessing
+```shell
+python tools/preprocess_data.py \
+--input oscar-1GB.jsonl \
+--output-prefix /datasets/oscar-1GB-llama \
+--tokenizer-type Llama2Tokenizer \
+--tokenizer-model /path/to/llama2_7b_hf/tokenizer.model \
+--workers 16 \
+--append-eod
+```
+### llama pretraining
+Script: `llama_pretrain.sh`
+Modify the dataset and tokenizer paths
+```shell
+DATA_PATH="/datasets/oscar-1GB-llama_text_document"
+--tokenizer-model /path/to/llama2_7b_hf/tokenizer.model
+```
+- Single-node multi-GPU training
+```shell
+bash llama_pretraining.sh >& llama_pretraining.log
 ```
-# np is the number of processes to launch; np and hostfile must both be set according to the actual environment
-mpirun -np 4 --hostfile hostfile single.sh localhost (single node, four GPUs)
+View the training log in `llama_pretraining.log`
+- Multi-node multi-GPU training
+Assume there are two nodes, 192.168.1.1 and 192.168.1.2
+```shell
+# Run the following command on node 192.168.1.1:
+bash llama_pretraining.sh --NNODES 2 --NODE_RANK 0 --MASTER_ADDR 192.168.1.1 >& llama_pretraining_rank0.log
+# Run the following command on node 192.168.1.2:
+bash llama_pretraining.sh --NNODES 2 --NODE_RANK 1 --MASTER_ADDR 192.168.1.1 >& llama_pretraining_rank1.log
 ```
+View the training logs in `llama_pretraining_rank0.log` and `llama_pretraining_rank1.log`
 # References
......
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
source /opt/dtk/env.sh
# te's gemm calls require the hipblaslt library to be loaded
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/datasets/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--ffn-hidden-size 2048
--num-attention-heads 16
--seq-length 4096 #4096
--max-position-embeddings 32768
)
# export NVTE_FLASH_ATTN=1 # use the autlass path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton_fa path
# --transformer-impl transformer_engine
# --use-mcore-models
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 60 #240 #512 #64
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--ckpt-format torch
--disable-bias-linear
--overlap-grad-reduce
--attention-dropout 0
--hidden-dropout 0
--ddp-average-in-collective
--recompute-granularity full
--recompute-num-layers 5
--recompute-method block
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 1
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type Llama2Tokenizer
--tokenizer-model /path/to/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
DISTRIBUTED_ARGS=(
--nproc_per_node 4
--nnodes 1
--node_rank 0
--master_addr localhost
--master_port 29500
)
export HIP_VISIBLE_DEVICES=0,1,2,3 #4,5,6,7
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
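As a sanity check on the configuration above: Megatron derives the number of gradient-accumulation steps from the global batch size, the micro batch size, and the data-parallel size, so with 4 processes and tensor-parallel size 4 this script runs with data-parallel size 1 and 60 accumulation steps per iteration. A quick sketch of the arithmetic (values copied from the script):

```shell
NPROC=4; TP=4; PP=1                # from DISTRIBUTED_ARGS / MODEL_PARALLEL_ARGS
MICRO_BATCH=1; GLOBAL_BATCH=60     # from TRAINING_ARGS
DP=$((NPROC / (TP * PP)))                      # data-parallel size -> 1
ACC=$((GLOBAL_BATCH / (MICRO_BATCH * DP)))     # gradient-accumulation steps -> 60
echo "data-parallel size: $DP, accumulation steps: $ACC"
```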
@@ -13,36 +13,10 @@ from .language_model import parallel_lm_logits
 from .language_model import get_language_model
-#def post_language_model_processing(lm_output, labels, logit_weights,
-#                                   parallel_output,
-#                                   fp16_lm_cross_entropy):
-#
-#    # Output. Format [s b h]
-#    output = parallel_lm_logits(
-#        lm_output,
-#        logit_weights,
-#        parallel_output)
-#
-#    if labels is None:
-#        # [s b h] => [b s h]
-#        return output.transpose(0,1).contiguous()
-#    else:
-#        # [b s] => [s b]
-#        labels = labels.transpose(0,1).contiguous()
-#        if fp16_lm_cross_entropy:
-#            assert output.dtype == torch.half
-#            loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels)
-#        else:
-#            loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels)
-#
-#        # [s b] => [b, s]
-#        loss = loss.transpose(0,1).contiguous()
-#        return loss
 def post_language_model_processing(lm_output, labels, logit_weights,
                                    parallel_output,
                                    fp16_lm_cross_entropy):
-    args = get_args()
     # Output. Format [s b h]
     output = parallel_lm_logits(
         lm_output,
@@ -53,37 +27,19 @@ def post_language_model_processing(lm_output, labels, logit_weights,
         # [s b h] => [b s h]
         return output.transpose(0,1).contiguous()
     else:
-        if not args.use_fast_cross_entropy_loss:
-            # [b s] => [s b]
-            labels = labels.transpose(0,1).contiguous()
-            if fp16_lm_cross_entropy:
-                assert output.dtype == torch.half
-                loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels)
-            else:
-                loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels)
-            # [s b] => [b, s]
-            loss = loss.transpose(0,1).contiguous()
-        else:
-            import os
-            from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss
-            # [s b h] => [b s h]
-            output = output.transpose(0,1).contiguous()
-            logit_softcapping = int(os.getenv("final_logit_softcapping", "0"))
-            loss = fast_cross_entropy_loss(
-                logits = output,
-                labels = labels,
-                logit_softcapping = logit_softcapping,
-            )
+        # [b s] => [s b]
+        labels = labels.transpose(0,1).contiguous()
+        if fp16_lm_cross_entropy:
+            assert output.dtype == torch.half
+            loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels)
+        else:
+            loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels)
+        # [s b] => [b, s]
+        loss = loss.transpose(0,1).contiguous()
         return loss
 class GPTModel(MegatronModule):
     """GPT-2 Language model."""
......
@@ -2,9 +2,6 @@
 import torch
 from torch import nn
-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
 class RMSNorm(torch.nn.Module):
@@ -26,11 +23,9 @@ class RMSNorm(torch.nn.Module):
         setattr(self.weight, 'sequence_parallel', sequence_parallel)
-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def forward(self, x):
         output = self._norm(x.float()).type_as(x)
         return output * self.weight
@@ -40,9 +40,6 @@ from megatron.legacy.model.utils import (
 )
 from megatron.training import get_args, get_timers
-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
 from .module import MegatronModule
 try:
@@ -59,10 +56,6 @@ except ImportError:
     )
 except ImportError:
     flash_attn_unpadded_func = None
-try:
-    from flash_attn.flash_attn_triton import flash_attn_func
-except ImportError:
-    flash_attn_func = None
 """ We use the following notation throughout this file:
     h: hidden size
@@ -139,8 +132,8 @@ class ParallelMLP(MegatronModule):
             self.activation_func = openai_gelu
         elif args.onnx_safe:
             self.activation_func = erf_gelu
-        elif args.swiglu:
-            @torch.compile(mode="max-autotune-no-cudagraphs") # compile optimization point
+        elif args.swiglu: # note: the *2 here needs more GPU memory
             def swiglu(x):
                 x = torch.chunk(x, 2, dim=-1)
                 return F.silu(x[0]) * x[1]
@@ -164,7 +157,7 @@ class ParallelMLP(MegatronModule):
             input_is_parallel=True,
             is_expert=is_expert,
         )
-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def forward(self, hidden_states):
         # [s, b, 4hp]
@@ -475,10 +468,6 @@ class FlashSelfAttention(torch.nn.Module):
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout
-        # Use FlashAttention-2 when args.use_flash_attn_ck is True
-        args = get_args()
-        self.flash_attn_func = flash_attn_unpadded_func
     def forward(self, q, k, v):
         """Implements the multihead softmax attention.
         Arguments
@@ -520,38 +509,6 @@ class FlashSelfAttention(torch.nn.Module):
         output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
         return output
-class FlashSelfAttentionTriton(torch.nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                      (default: 1/sqrt(d_keys) where d_keys is computed at
-                      runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0,
-                 device=None, dtype=None):
-        super().__init__()
-        assert flash_attn_func is not None, ('Triton version of FlashAttention is not installed.')
-        assert rearrange is not None, 'Please install einops first, e.g., with pip install einops'
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-    def forward(self, q, k, v):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
-        """
-        assert q.dtype in [torch.float16, torch.bfloat16]
-        assert q.is_cuda
-        q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous()
-                   for x in (q, k, v)]
-        output = flash_attn_func(q, k, v, self.causal)
-        output = rearrange(output, 'b s h d -> h b (s d)').contiguous()
-        return output
 class ParallelAttention(MegatronModule):
     """Parallel self-attention layer abstract class.
@@ -580,19 +537,13 @@ class ParallelAttention(MegatronModule):
         else:
             kv_projection_size = args.kv_channels * args.num_attention_heads
-        self.use_flash_attn = (args.use_flash_attn_ck or args.use_flash_attn_triton) \
+        self.use_flash_attn = args.use_flash_attn \
             and attention_type == AttnType.self_attn \
             and self.attn_mask_type == AttnMaskType.causal
-        self.use_flash_attn_triton = args.use_flash_attn_triton
         if self.use_flash_attn:
-            if args.use_flash_attn_ck:
-                if flash_attn_unpadded_func is None:
-                    raise ImportError('FlashAttention is not installed, please install with '
-                                      'pip install flash-attn')
-            if args.use_flash_attn_triton:
-                assert flash_attn_func != None, "Cannot import FlashAttention triton "
+            if flash_attn_unpadded_func is None:
+                raise ImportError('FlashAttention is not installed, please install with '
+                                  'pip install flash-attn')
             assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
                                                           'self-attention for now')
             assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
@@ -652,10 +603,7 @@ class ParallelAttention(MegatronModule):
                 self.attn_mask_type)
             self.checkpoint_core_attention = config.recompute_granularity == 'selective'
-        # Currently FlashAttention only works with causal mask
-        if self.use_flash_attn_triton:
-            self.core_attention_flash = FlashSelfAttentionTriton(causal=True, attention_dropout=args.attention_dropout)
-        elif self.use_flash_attn:
+        if self.use_flash_attn:
             self.core_attention_flash = FlashSelfAttention(
                 causal=True, attention_dropout=config.attention_dropout
             )
@@ -763,7 +711,7 @@ class ParallelAttention(MegatronModule):
                     dim=3)
                 # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-                query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+                query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
             else:
                 # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
                 mixed_kv_layer, _ = self.key_value(encoder_output)
@@ -868,17 +816,14 @@ class ParallelAttention(MegatronModule):
                 context_layer = self.core_attention(
                     query_layer, key_layer, value_layer, attention_mask)
             else:
-                if not self.use_flash_attn_triton:
-                    query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous()
-                    #q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
-                        for x in (query_layer, key_layer, value_layer)]
+                q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
+                           for x in (query_layer, key_layer, value_layer)]
                 if not self.sequence_parallel:
                     with tensor_parallel.get_cuda_rng_tracker().fork():
-                        context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
+                        context_layer = self.core_attention_flash(q, k, v)
                 else:
-                    context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
-                if not self.use_flash_attn_triton:
-                    context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+                    context_layer = self.core_attention_flash(q, k, v)
+                context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
         # =================
         # Output. [sq, b, h]
@@ -1229,8 +1174,6 @@ class ParallelTransformerLayer(MegatronModule):
         # hidden_states: [s, b, h]
         # Layer norm at the beginning of the transformer layer.
-        # from unsloth.kernels.rms_layernorm import fast_rms_layernorm
-        # norm_output = self.input_norm(hidden_states) if not args.use_fast_rms_layernorm else fast_rms_layernorm(self.input_norm, hidden_states)
         norm_output = self.input_norm(hidden_states)
         # Self attention.
......
@@ -9,8 +9,6 @@ import torch
 from megatron.training import get_args
 from megatron.legacy.model import LayerNorm, RMSNorm
 from megatron.core.jit import jit_fuser
-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
 def init_method_normal(sigma):
     """Init method based on N(0, sigma)."""
@@ -60,7 +58,7 @@ def openai_gelu(x):
 def erf_gelu(x):
     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
-@torch.compile(mode="max-autotune-no-cudagraphs")
 def get_norm(config):
     args = get_args()
     if args.normalization == "LayerNorm":
......
@@ -51,7 +51,6 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_one_logger_args(parser)
     parser = _add_ft_package_args(parser)
     parser = _add_config_logger_args(parser)
-    parser = _add_unsloth_args(parser)
     # Custom arguments.
     if extra_args_provider is not None:
@@ -72,8 +71,8 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     # Args from environment
-    #args.rank = int(os.getenv('RANK', '0'))
-    #args.world_size = int(os.getenv("WORLD_SIZE", '1'))
+    args.rank = int(os.getenv('RANK', '0'))
+    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
     return args
@@ -538,8 +537,6 @@ def validate_args(args, defaults={}):
     if args.decoupled_lr is not None or args.decoupled_min_lr is not None:
         assert not args.use_legacy_models, \
             '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'
-    # FlashAttention
-    args.use_flash_attn = args.use_flash_attn_ck or args.use_flash_attn_triton
     # Legacy RoPE arguments
     if args.use_rotary_position_embeddings:
@@ -573,7 +570,7 @@ def validate_args(args, defaults={}):
         "Expert parallelism is not supported with fp16 training."
     # Distributed checkpointing checks
-    args.use_dist_ckpt = False
+    # print(f"args.use_dist_ckpt: {args.use_dist_ckpt}")
     if args.use_dist_ckpt and args.use_legacy_models:
         raise RuntimeError('--use-dist-ckpt is not supported in legacy models.')
@@ -1220,11 +1217,9 @@ def _add_training_args(parser):
     group.add_argument('--cross-entropy-loss-fusion', action='store_true',
                        help='Enabled fusion of cross entropy loss calculation.',
                        dest='cross_entropy_loss_fusion')
-    group.add_argument('--use-flash-attn-ck', action='store_true',
+    group.add_argument('--use-flash-attn', action='store_true',
                        help='use FlashAttention implementation of attention. '
                        'https://arxiv.org/abs/2205.14135')
-    group.add_argument('--use-flash-attn-triton', action='store_true',
-                       help='use FlashAttention implementation of attention using Triton.')
     group.add_argument('--disable-bias-linear', action='store_false',
                        help='Disable bias in the linear layers',
                        dest='add_bias_linear')
@@ -1574,9 +1569,7 @@ def _add_distributed_args(parser):
                        default=False, help='If set, use custom-built ring exchange '
                        'for p2p communications. Note that this option will require '
                        'a custom built image that support ring-exchange p2p.')
-    # group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
-    #                    help='local rank passed from distributed launcher.')
-    group.add_argument('--local_rank', type=int, default=None,
+    group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
                        help='If set to True, initialize_megatron() '
@@ -1602,12 +1595,6 @@ def _add_distributed_args(parser):
                        help='If set, distributed ranks initialize order is changed '
                        'from tp-dp-pp to tp-pp-dp. Make sure EP and CP aren\'t used '
                        'with this option enabled')
-    group.add_argument('--rank', default=-1, type=int,
-                       help='node rank for distributed training')
-    group.add_argument('--world_size', type=int, default=8,
-                       help='number of nodes for distributed training')
-    group.add_argument('--dist_url',
-                       help='Which master node url for distributed training.')
     return parser
@@ -1703,7 +1690,6 @@ def _add_data_args(parser):
                                 'GPTSentencePieceTokenizer',
                                 'HuggingFaceTokenizer',
                                 'Llama2Tokenizer',
-                                'QwenTokenizer',
                                 'TikTokenizer',
                                 'NullTokenizer'],
                        help='What type of tokenizer to use.')
@@ -1942,13 +1928,3 @@ def _add_experimental_args(parser):
     group.add_argument('--yaml-cfg', type=str, default=None,
                        help = 'Config file to add additional arguments')
     return parser
-def _add_unsloth_args(parser):
-    group = parser.add_argument_group(title='unsloth')
-    group.add_argument('--use-fast-cross-entropy-loss', action='store_true',
-                       help='Use fast_cross_entropy_loss of unsloth more faster in calculating loss')
-    group.add_argument('--use-fast-rms-layernorm', action='store_true',
-                       help='Use fast_rms_layernorm of unsloth more faster in Layer Normalization')
-    return parser
@@ -20,9 +20,9 @@ from megatron.core import mpu, tensor_parallel, dist_checkpointing
 from megatron.core.dist_checkpointing.mapping import ShardedObject
 from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy
 from megatron.core.dist_checkpointing.state_dict_transformation import (
     prepare_state_dict_for_save,
     recreate_state_dict_after_load,
 )
 from megatron.core.dist_checkpointing.strategies.fully_parallel import \
     FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper
 from megatron.core.num_microbatches_calculator import update_num_microbatches
......
@@ -170,11 +170,11 @@ def _compile_dependencies():
     if torch.distributed.get_rank() == 0:
         start_time = time.time()
         print("> compiling and loading fused kernels ...", flush=True)
-        #fused_kernels.load(args)
+        # fused_kernels.load(args)
         torch.distributed.barrier()
     else:
         torch.distributed.barrier()
-        #fused_kernels.load(args)
+        # fused_kernels.load(args)
     # Simple barrier to make sure all ranks have passed the
     # compilation phase successfully before moving on to the
     # rest of the program. We think this might ensure that
@@ -240,35 +240,20 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
         print("> initializing torch distributed ...", flush=True)
     # Manually set the device ids.
     if device_count > 0:
-        #torch.cuda.set_device(args.local_rank)
-        #device_id = torch.device(f'cuda:{args.local_rank}')
-        device_id = args.rank % device_count
-        if args.local_rank is not None:
-            assert (
-                args.local_rank == device_id
-            ), "expected local-rank to be the same as rank % device-count."
-        else:
-            args.local_rank = device_id
-        torch.cuda.set_device(device_id)
+        torch.cuda.set_device(args.local_rank)
+        device_id = torch.device(f'cuda:{args.local_rank}')
     else:
         device_id = None
     # Call the init process
-    torch.distributed.init_process_group(
-        backend=args.distributed_backend,
-        world_size=args.world_size,
-        rank=args.rank,
-        init_method=args.dist_url,
-        timeout=timedelta(minutes=args.distributed_timeout_minutes),
-    )
-    #init_process_group_kwargs = {
-    #    'backend' : args.distributed_backend,
-    #    'world_size': args.world_size,
-    #    'rank': args.rank,
-    #    'timeout': timedelta(minutes=args.distributed_timeout_minutes),
-    #}
-    #torch.distributed.init_process_group(**init_process_group_kwargs)
+    init_process_group_kwargs = {
+        'backend' : args.distributed_backend,
+        'world_size': args.world_size,
+        'rank': args.rank,
+        'timeout': timedelta(minutes=args.distributed_timeout_minutes),
+    }
+    torch.distributed.init_process_group(**init_process_group_kwargs)
     # Set the tensor model-parallel, pipeline model-parallel, and
     # data-parallel communicators.
@@ -349,7 +334,7 @@ def set_jit_fusion_options():
         torch._C._jit_override_can_fuse_on_cpu(False)
         torch._C._jit_override_can_fuse_on_gpu(False)
         torch._C._jit_set_texpr_fuser_enabled(False)
-        torch._C._jit_set_nvfuser_enabled(False) #True
+        torch._C._jit_set_nvfuser_enabled(True)
         torch._C._debug_set_autodiff_subgraph_inlining(False)
     else:
         # legacy pytorch fuser
......
@@ -15,7 +15,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
-from transformers import Qwen2Tokenizer
 def build_tokenizer(args, **kwargs):
     """Initialize tokenizer."""
@@ -49,8 +49,6 @@ def build_tokenizer(args, **kwargs):
     elif args.tokenizer_type == 'Llama2Tokenizer':
         assert args.tokenizer_model is not None
         tokenizer = _Llama2Tokenizer(args.tokenizer_model)
-    elif args.tokenizer_type == 'QwenTokenizer':
-        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == 'TikTokenizer':
         assert args.tokenizer_model is not None
         assert args.tiktoken_pattern is not None
@@ -134,43 +132,6 @@ class _HuggingFaceTokenizer(MegatronTokenizer):
     def eod(self):
         return self._tokenizer.eos_token_id
-class _Qwen2Tokenizer(MegatronTokenizer):
-    def __init__(self, vocab_file, merge_file, extra_vocab_size=0):
-        super().__init__(vocab_file, merge_file)
-        self.tokenizer = Qwen2Tokenizer(vocab_file, merge_file)
-        self.extra_vocab_size = extra_vocab_size
-        self.tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>"))
-    @property
-    def vocab_size(self):
-        return len(self.tokenizer.encoder) + self.extra_vocab_size
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-    @property
-    def eod(self):
-        return self.tokenizer.eos_token_id
-    @property
-    def eos_token(self):
-        return self.tokenizer.eos_token
-    @property
-    def pad_token_id(self):
-        return self.tokenizer.pad_token_id
 class _BertWordPieceTokenizer(MegatronTokenizer):
     """Original BERT wordpiece tokenizer."""
......
@@ -1090,7 +1090,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler,
     # Recover timing
     timers('interval-time', log_level=0).start(barrier=True)
-# @torch.compile(mode="max-autotune-no-cudagraphs") #
 def train(forward_step_func, model, optimizer, opt_param_scheduler,
           train_data_iterator, valid_data_iterator,
           process_non_loss_data_func, config, checkpointing_context):
......
@@ -5,3 +5,13 @@ six
 regex
 pyyaml
 sentencepiece
+# ==== test ====
+nltk
+pytest
+requests
+wrapt
+tensorboard
+tensorboardX
+scipy
+psutil
\ No newline at end of file
@@ -68,3 +68,7 @@ def test_local_multi_tensor_apply():
         False,
     )
     torch.testing.assert_close(norm_apex, norm_local)
+if __name__ == '__main__':
+    test_local_multi_tensor_l2_norm_and_scale()
+    test_local_multi_tensor_apply()
\ No newline at end of file
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-8.3.3, pluggy-1.5.0
rootdir: /workspace
configfile: pytest.ini
plugins: mock-3.14.0
collected 562 items / 3 errors
==================================== ERRORS ====================================
_______ ERROR collecting tests/unit_tests/data/test_preprocess_mmdata.py _______
tests/unit_tests/data/test_preprocess_mmdata.py:14: in <module>
from tools.preprocess_mmdata import Encoder
tools/preprocess_mmdata.py:12: in <module>
from torchvision.transforms import ToTensor
/usr/local/lib/python3.10/site-packages/torchvision/__init__.py:10: in <module>
from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils # usort:skip
/usr/local/lib/python3.10/site-packages/torchvision/_meta_registrations.py:164: in <module>
def meta_nms(dets, scores, iou_threshold):
/usr/local/lib/python3.10/site-packages/torch/library.py:654: in register
use_lib._register_fake(op_name, func, _stacklevel=stacklevel + 1)
/usr/local/lib/python3.10/site-packages/torch/library.py:154: in _register_fake
handle = entry.abstract_impl.register(func_to_register, source)
/usr/local/lib/python3.10/site-packages/torch/_library/abstract_impl.py:31: in register
if torch._C._dispatch_has_kernel_for_dispatch_key(self.qualname, "Meta"):
E RuntimeError: operator torchvision::nms does not exist
__ ERROR collecting tests/unit_tests/dist_checkpointing/models/test_mamba.py ___
ImportError while importing test module '/workspace/tests/unit_tests/dist_checkpointing/models/test_mamba.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
megatron/core/ssm/mamba_mixer.py:41: in <module>
from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
E ModuleNotFoundError: No module named 'mamba_ssm'
During handling of the above exception, another exception occurred:
/usr/local/lib/python3.10/importlib/__init__.py:126: in import_module
return _bootstrap._gcd_import(name[level:], package, level)
tests/unit_tests/dist_checkpointing/models/test_mamba.py:17: in <module>
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
megatron/core/ssm/mamba_mixer.py:47: in <module>
raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported")
E ImportError: mamba-ssm is required by the Mamba model but cannot be imported
_________ ERROR collecting tests/unit_tests/models/test_mamba_model.py _________
ImportError while importing test module '/workspace/tests/unit_tests/models/test_mamba_model.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
megatron/core/ssm/mamba_mixer.py:41: in <module>
from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
E ModuleNotFoundError: No module named 'mamba_ssm'
During handling of the above exception, another exception occurred:
/usr/local/lib/python3.10/importlib/__init__.py:126: in import_module
return _bootstrap._gcd_import(name[level:], package, level)
tests/unit_tests/models/test_mamba_model.py:7: in <module>
from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
megatron/core/models/mamba/mamba_layer_specs.py:11: in <module>
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
megatron/core/ssm/mamba_mixer.py:47: in <module>
raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported")
E ImportError: mamba-ssm is required by the Mamba model but cannot be imported
=============================== warnings summary ===============================
megatron/core/tensor_parallel/layers.py:280
/workspace/megatron/core/tensor_parallel/layers.py:280: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
def forward(ctx, input, weight, bias, allreduce_dgrad):
megatron/core/tensor_parallel/layers.py:290
/workspace/megatron/core/tensor_parallel/layers.py:290: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
def backward(ctx, grad_output):
megatron/core/tensor_parallel/layers.py:381
/workspace/megatron/core/tensor_parallel/layers.py:381: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
def forward(
megatron/core/tensor_parallel/layers.py:420
/workspace/megatron/core/tensor_parallel/layers.py:420: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
def backward(ctx, grad_output):
megatron/core/transformer/attention.py:29
/workspace/megatron/core/transformer/attention.py:29: DeprecationWarning: The 'megatron.core.transformer.custom_layers.transformer_engine'
module is deprecated and will be removed in 0.10.0. Please use
'megatron.core.extensions.transformer_engine' instead.
from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim
megatron/core/dist_checkpointing/strategies/torch.py:17
/workspace/megatron/core/dist_checkpointing/strategies/torch.py:17: DeprecationWarning: `torch.distributed._sharded_tensor` will be deprecated, use `torch.distributed._shard.sharded_tensor` instead
from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor
tests/unit_tests/dist_checkpointing/test_async_save.py:74
/workspace/tests/unit_tests/dist_checkpointing/test_async_save.py:74: PytestUnknownMarkWarning: Unknown pytest.mark.flaky_in_dev - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.flaky_in_dev
tests/unit_tests/dist_checkpointing/test_fp8.py:55
/workspace/tests/unit_tests/dist_checkpointing/test_fp8.py:55: PytestUnknownMarkWarning: Unknown pytest.mark.flaky_in_dev - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.flaky_in_dev
tests/unit_tests/test_utilities.py:11
/workspace/tests/unit_tests/test_utilities.py:11: PytestCollectionWarning: cannot collect test class 'TestModel' because it has a __init__ constructor (from: tests/unit_tests/distributed/test_param_and_grad_buffer.py)
class TestModel(torch.nn.Module):
tests/unit_tests/test_utilities.py:11
/workspace/tests/unit_tests/test_utilities.py:11: PytestCollectionWarning: cannot collect test class 'TestModel' because it has a __init__ constructor (from: tests/unit_tests/test_utilities.py)
class TestModel(torch.nn.Module):
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:20
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:20: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:36
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:36: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:52
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:52: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:71
/workspace/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py:71: PytestUnknownMarkWarning: Unknown pytest.mark.timeout - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
@pytest.mark.timeout(120)
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
=========================== short test summary info ============================
ERROR tests/unit_tests/data/test_preprocess_mmdata.py - RuntimeError: operato...
ERROR tests/unit_tests/dist_checkpointing/models/test_mamba.py
ERROR tests/unit_tests/models/test_mamba_model.py
!!!!!!!!!!!!!!!!!!! Interrupted: 3 errors during collection !!!!!!!!!!!!!!!!!!!!
======================== 14 warnings, 3 errors in 2.70s ========================
@@ -203,7 +203,7 @@ def get_args():
                        choices=['BertWordPieceLowerCase','BertWordPieceCase',
                                 'GPT2BPETokenizer', 'SentencePieceTokenizer',
                                 'GPTSentencePieceTokenizer', 'Llama2Tokenizer',
-                                'Llama3Tokenizer', 'MistralTokenizer', 'QwenTokenizer', 'NullTokenizer'],
+                                'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--tokenizer-model', type=str, default=None,
                        help='YTTM tokenizer model.')
......