Commit 4b255099 authored by liangjing

add qwen

parent f2464dc2
Pipeline #1871 passed
@@ -1703,6 +1703,7 @@ def _add_data_args(parser):
'GPTSentencePieceTokenizer',
'HuggingFaceTokenizer',
'Llama2Tokenizer',
'QwenTokenizer',
'TikTokenizer',
'NullTokenizer'],
help='What type of tokenizer to use.')
......
@@ -15,7 +15,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer
from transformers import Qwen2Tokenizer


def build_tokenizer(args, **kwargs):
    """Initialize tokenizer."""
@@ -49,6 +49,8 @@ def build_tokenizer(args, **kwargs):
    elif args.tokenizer_type == 'Llama2Tokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _Llama2Tokenizer(args.tokenizer_model)
    elif args.tokenizer_type == 'QwenTokenizer':
        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type == 'TikTokenizer':
        assert args.tokenizer_model is not None
        assert args.tiktoken_pattern is not None
@@ -132,6 +134,43 @@ class _HuggingFaceTokenizer(MegatronTokenizer):
    def eod(self):
        return self._tokenizer.eos_token_id
class _Qwen2Tokenizer(MegatronTokenizer):
    """Qwen2 BPE tokenizer (HuggingFace Qwen2Tokenizer) wrapped in the Megatron tokenizer interface."""

    def __init__(self, vocab_file, merge_file, extra_vocab_size=0):
        super().__init__(vocab_file, merge_file)
        self.tokenizer = Qwen2Tokenizer(vocab_file, merge_file)
        self.extra_vocab_size = extra_vocab_size
        # Register a dedicated pad token ("<|extra_0|>") so padding has a valid id.
        self.tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>"))

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder) + self.extra_vocab_size

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.tokenizer.eos_token_id

    @property
    def eos_token(self):
        return self.tokenizer.eos_token

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id
class _BertWordPieceTokenizer(MegatronTokenizer):
"""Original BERT wordpiece tokenizer."""
......
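For reference, a minimal usage sketch of the wrapper added above (not part of the commit): the two file paths are placeholders for Qwen's BPE vocabulary and merges files, and the snippet assumes it runs where _Qwen2Tokenizer is in scope (e.g. inside this tokenizer module).

    # Hypothetical sketch: round-trip text through the _Qwen2Tokenizer wrapper above.
    # Replace the placeholder paths with Qwen's actual vocab.json / merges.txt.
    tok = _Qwen2Tokenizer("/path_to_qwen_token/vocab.json",
                          "/path_to_qwen_token/merges.txt")
    ids = tok.tokenize("hello qwen")              # encode -> list of BPE token ids
    print(tok.detokenize(ids))                    # decodes back to the input text
    print(tok.vocab_size, tok.eod, tok.pad_token_id)

In training, the same object is built by build_tokenizer once --tokenizer-type QwenTokenizer, --vocab-file, and --merge-file are passed, as the run script below does.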
#!/bin/bash
# Runs the Qwen "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=xx #based on your environment
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
source /opt/dtk/env.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/path_to_my-qwen_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 28
--hidden-size 3584
--ffn-hidden-size 18944
--num-attention-heads 28
--seq-length 4096
--max-position-embeddings 32768
--num-query-groups 4
--group-query-attention
)
TRAINING_ARGS=(
--log-throughput
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 12 #512
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-triton
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--add-qkv-bias
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--use-fast-cross-entropy-loss
)
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type QwenTokenizer
--merge-file /path_to_qwen_token/merges.txt
--vocab-file /path_to_qwen_token/vocab.json
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
APP="python3 -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
"
# For Hygon: bind each local rank's CPU and memory to its own NUMA node via numactl
case ${lrank} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
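Launch details are inferred from the script rather than stated in the commit: each process reads its rank and world size from the OpenMPI variables OMPI_COMM_WORLD_RANK and OMPI_COMM_WORLD_SIZE, and the master node address comes from the script's first positional argument (used in --dist_url tcp://${1}:34566), so the script is presumably started once per rank via mpirun, with numactl pinning each local rank to its NUMA node on the Hygon host.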
(Two further files in this commit are too large to render in the diff view and can only be viewed as raw blobs.)