Commit 4b255099 authored by liangjing

add qwen

parent f2464dc2
Pipeline #1871 passed
@@ -1703,6 +1703,7 @@ def _add_data_args(parser):
                                'GPTSentencePieceTokenizer',
                                'HuggingFaceTokenizer',
                                'Llama2Tokenizer',
                                'QwenTokenizer',
                                'TikTokenizer',
                                'NullTokenizer'],
                       help='What type of tokenizer to use.')
...
@@ -15,7 +15,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer
from transformers import Qwen2Tokenizer


def build_tokenizer(args, **kwargs):
    """Initialize tokenizer."""
@@ -49,6 +49,8 @@ def build_tokenizer(args, **kwargs):
    elif args.tokenizer_type == 'Llama2Tokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _Llama2Tokenizer(args.tokenizer_model)
    elif args.tokenizer_type == 'QwenTokenizer':
        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type == 'TikTokenizer':
        assert args.tokenizer_model is not None
        assert args.tiktoken_pattern is not None
@@ -132,6 +134,43 @@ class _HuggingFaceTokenizer(MegatronTokenizer):
    def eod(self):
        return self._tokenizer.eos_token_id
class _Qwen2Tokenizer(MegatronTokenizer):
    """Qwen2 BPE tokenizer wrapped in the Megatron tokenizer interface."""

    def __init__(self, vocab_file, merge_file, extra_vocab_size=0):
        super().__init__(vocab_file, merge_file)
        self.tokenizer = Qwen2Tokenizer(vocab_file, merge_file)
        self.extra_vocab_size = extra_vocab_size
        self.tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>"))

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder) + self.extra_vocab_size

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.tokenizer.eos_token_id

    @property
    def eos_token(self):
        return self.tokenizer.eos_token

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id
class _BertWordPieceTokenizer(MegatronTokenizer):
    """Original BERT wordpiece tokenizer."""
...
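As a quick sanity check, the new wrapper can be exercised directly. The sketch below is illustrative only: the file paths are placeholders, and the import path is an assumption that depends on where this tree keeps tokenizer.py.

# Minimal sketch (assumed import path and placeholder paths, not part of the commit):
from megatron.tokenizer.tokenizer import _Qwen2Tokenizer

tok = _Qwen2Tokenizer("/path_to_qwen_token/vocab.json",
                      "/path_to_qwen_token/merges.txt")

ids = tok.tokenize("hello world")
print(ids)                  # BPE token ids
print(tok.detokenize(ids))  # should round-trip the input text
print(tok.vocab_size)       # len(encoder) + extra_vocab_size
print(tok.eod)              # eos_token_id of the underlying Qwen2Tokenizer
print(tok.pad_token_id)     # id of the added "<|extra_0|>" pad token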
#!/bin/bash
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=xx  # set according to your environment
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
source /opt/dtk/env.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
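# This script is meant to be launched via Open MPI (one process per local
# rank), with the master node's address passed as $1 (consumed by --dist_url
# below). Illustrative launch, adjust process counts and hosts to your cluster:
#   mpirun -np 16 -N 8 --hostfile <hosts> bash <this_script>.sh <master_ip>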
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/path_to_my-qwen_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
    --num-layers 28
    --hidden-size 3584
    --ffn-hidden-size 18944
    --num-attention-heads 28
    --seq-length 4096
    --max-position-embeddings 32768
    --num-query-groups 4
    --group-query-attention
)
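# Shape check (Qwen2-7B): head dim = 3584 / 28 = 128; with 4 query groups,
# the 28 query heads share 4 KV heads (7 query heads per KV head).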
TRAINING_ARGS=(
    --log-throughput
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 12 #512
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --use-distributed-optimizer
    --use-flash-attn-triton
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --no-gradient-accumulation-fusion
    --add-qkv-bias
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --use-fast-cross-entropy-loss
)
MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 4
)
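# Each model replica spans TP x PP = 2 x 4 = 8 GPUs (one full node here);
# data-parallel size = WORLD_SIZE / 8, and --global-batch-size must be divisible
# by micro-batch x DP (e.g. DP=2 on 16 GPUs gives 12 / (1 x 2) = 6 accumulation steps).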
DATA_ARGS=(
    --data-path $DATA_PATH
    --split 949,50,1
    --untie-embeddings-and-output-weights
    --use-rotary-position-embeddings
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-type QwenTokenizer
    --merge-file /path_to_qwen_token/merges.txt
    --vocab-file /path_to_qwen_token/vocab.json
)
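# --split 949,50,1 weights the train/validation/test partitions drawn from DATA_PATH.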
EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)
APP="python3 -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
"
# for Hygon
case ${lrank} in
0)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
1)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
2)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
3)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
4)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
5)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
6)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
7)
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac