OpenDAS / Megatron-LM · Commits

Commit f43ec2dd
Authored Mar 17, 2025 by wangxj
Add legacy optimization options and Llama example scripts
Parent: 0fc0844f
Pipeline #2547 passed
Showing 6 changed files with 590 additions and 1 deletion.
examples/llama/Llama3_8b.sh                +233  -0
examples/qwen/qwen1.5_14b.sh               +252  -0
examples/qwen/qwen_data_process.sh           +8  -0
megatron/legacy/model/transformer.py         +1  -1
megatron/training/arguments.py               +2  -0
megatron/training/tokenizer/tokenizer.py    +94  -0
examples/llama/Llama3_8b.sh (new file, mode 100755)
#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # print NCCL logs at error level only

source /opt/dtk/env.sh
# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# update rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH

# torch: force multi-stream communication onto a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# add synchronization during profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# capture rocblas sizes
# export ROCBLAS_LAYER=3
# capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# increase the compile cache size
export cache_size_limit=64

# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document

GPT_MODEL_ARGS=(
    --num-layers 32
    --hidden-size 4096
    --ffn-hidden-size 14336
    --num-attention-heads 32
    --max-position-embeddings 8192
    --group-query-attention
    --num-query-groups 8
    --swiglu
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility
)

export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models

TRAINING_ARGS=(
    --transformer-impl local # use these two options for the legacy path
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 64 #240 #60 #512 #64
    --train-iters 10
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires an explicit loss scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in data-parallel communication, average gradients/params directly instead of summing first and then averaging
    # --recompute-granularity full # enable recomputation to cut memory at the cost of extra time
    # --recompute-num-layers 5 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP gradient reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; optimization not yet adapted
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM; optimization not yet adapted
    --use-flash-attn
)

# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type Llama3Tokenizer
    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama3.2/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 1000
    --eval-interval 1000
    --save $SAVE_PATH
    --load $SAVE_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-dir prof_data
)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

APP="python -u ../../pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
# enable profiling
#     ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,

${APP}
# case ${LOCAL_RANK} in
# [0])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [1])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=1 --membind=1 ${APP}
# ;;
# [2])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=2 --membind=2 ${APP}
# ;;
# [3])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=3 --membind=3 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [4])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=4 --membind=4 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [5])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=5 --membind=5 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [6])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=6 --membind=6 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [7])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=7 --membind=7 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# esac
\ No newline at end of file
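
For orientation, the parallel layout this script implies can be checked with a little arithmetic. The sketch below is not part of the commit; it assumes a single launch over the 8 devices listed in HIP_VISIBLE_DEVICES and the usual Megatron relations world_size = TP x PP x DP and num_microbatches = global_batch / (micro_batch x DP).

# Sanity check of the layout configured in Llama3_8b.sh (assumes 8 ranks in total).
world_size = 8                                           # HIP_VISIBLE_DEVICES=0,...,7
tp, pp = 2, 2                                            # --tensor/--pipeline-model-parallel-size
dp = world_size // (tp * pp)                             # data-parallel size -> 2

micro_batch, global_batch = 1, 64
num_microbatches = global_batch // (micro_batch * dp)    # 32 accumulation steps per iteration

seq_len = 4096
tokens_per_iter = global_batch * seq_len                 # 262144 tokens per iteration

print(dp, num_microbatches, tokens_per_iter)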
examples/qwen/qwen1.5_14b.sh (new file, mode 100755)
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # print NCCL logs at error level only

source /opt/dtk/env.sh
# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# update rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0224/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0227/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH

# torch: force multi-stream communication onto a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# add synchronization during profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# capture rocblas sizes
# export ROCBLAS_LAYER=3
# capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# increase the compile cache size
export cache_size_limit=64

# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-qwen_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document

GPT_MODEL_ARGS=(
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13696
    --num-attention-heads 40
    --max-position-embeddings 32768
    --num-query-groups 40
    --group-query-attention
    --normalization RMSNorm
    --position-embedding-type rope # none #
    --untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility
)

export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models

TRAINING_ARGS=(
    --transformer-impl local # use these two options for the legacy path
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 32 #256 #240 #60 #512 #64
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires an explicit loss scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in data-parallel communication, average gradients/params directly instead of summing first and then averaging
    # --recompute-granularity full # enable recomputation to cut memory at the cost of extra time
    # --recompute-num-layers 5 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP gradient reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; optimization not yet adapted
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
    --use-flash-attn
)

# environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 1
    # --num-layers-per-virtual-pipeline-stage 4
    # --microbatch-group-size-per-virtual-pipeline-stage 1
    # --no-overlap-p2p-communication # when enabled
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type QwenTokenizer
    --merge-file /public/home/wangxj/Downloads/model_weights/qwen1.5_14b/merges.txt
    --vocab-file /public/home/wangxj/Downloads/model_weights/qwen1.5_14b/vocab.json
    # --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
    # --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 1000
    --eval-interval 1000
    --save $SAVE_PATH
    --load $SAVE_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-dir prof_data
)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

# torchrun --nproc_per_node 8 pretrain_gpt.py \
#     ${GPT_MODEL_ARGS[@]} \
#     ${TRAINING_ARGS[@]} \
#     ${MODEL_PARALLEL_ARGS[@]} \
#     ${DATA_ARGS[@]} \
#     ${EVAL_AND_LOGGING_ARGS[@]}

APP="python -u ../../pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
# enable profiling
#     ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}

case ${LOCAL_RANK} in
[0])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[4])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[5])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[6])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[7])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
esac
\ No newline at end of file
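
Unlike the Llama script, which runs ${APP} directly, this one pins each local rank to the matching NUMA node through numactl before launching. A minimal sketch of that mapping, not part of the commit and assuming one NUMA node per local rank as the eight-way case statement implies:

import os

# LOCAL_RANK would come from OMPI_COMM_WORLD_LOCAL_RANK; default to 0 for illustration.
local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", 0))

# Equivalent of the branch taken in the case statement: bind this rank's CPU and
# memory allocations to NUMA node `local_rank`, then run the Megatron command in ${APP}.
prefix = ["numactl", f"--cpunodebind={local_rank}", f"--membind={local_rank}"]
print(" ".join(prefix), "python -u ../../pretrain_gpt.py ...")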
examples/qwen/qwen_data_process.sh (new file, mode 100755)
python tools/preprocess_data.py \
    --input /public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head.jsonl \
    --output-prefix /public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-qwen \
    --vocab-file /public/home/wangxj/Downloads/model_weights/qwen1.5_14b/vocab.json \
    --tokenizer-type QwenTokenizer \
    --merge-file /public/home/wangxj/Downloads/model_weights/qwen1.5_14b/merges.txt \
    --append-eod \
    --workers 8
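
The prefix passed to --output-prefix above is the same prefix that DATA_PATH points at in examples/qwen/qwen1.5_14b.sh. A small sketch of that relationship, assuming preprocess_data.py keeps its usual naming scheme of <output-prefix>_<json-key>_document.{bin,idx} with the default key "text":

# Illustration only: reconstruct the file names the preprocessing run should produce.
prefix = "/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-qwen"

expected_outputs = [f"{prefix}_text_document{ext}" for ext in (".bin", ".idx")]
print(expected_outputs)

# qwen1.5_14b.sh then loads the same data by passing the prefix without an extension:
data_path = f"{prefix}_text_document"
print(data_path)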
megatron/legacy/model/transformer.py
@@ -712,7 +712,7 @@ class ParallelAttention(MegatronModule):
                 dim=3)

             # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-            query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+            query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
         else:
             # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
             mixed_kv_layer, _ = self.key_value(encoder_output)
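
The one-line change above swaps query_layer.view(...) for query_layer.contiguous().view(...). The standalone snippet below is not the Megatron code path and its shapes are made up, but it shows why the change matters: tensors produced by torch.split along the last dimension are non-contiguous views, and .view() refuses to reshape a tensor whose strides do not match the requested shape.

import torch

big = torch.randn(4, 2, 3, 24)                  # stand-in for a fused QKV projection output
q, k, v = torch.split(big, [16, 4, 4], dim=3)   # splitting the last dim yields non-contiguous views
print(q.is_contiguous())                        # False

try:
    q.view(4, 2, -1, 8)                         # reshape the head dimensions
except RuntimeError as err:
    print("view() failed:", err)

q = q.contiguous().view(4, 2, -1, 8)            # copy into contiguous memory first, then reshape
print(q.shape)                                  # torch.Size([4, 2, 6, 8])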
megatron/training/arguments.py
@@ -2026,6 +2026,8 @@ def _add_tokenizer_args(parser):
                                 'GPTSentencePieceTokenizer',
                                 'HuggingFaceTokenizer',
                                 'Llama2Tokenizer',
+                                'Llama3Tokenizer',
+                                'QwenTokenizer',
                                 'TikTokenizer',
                                 'MultimodalTokenizer',
                                 'NullTokenizer'],
megatron/training/tokenizer/tokenizer.py
@@ -15,6 +15,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
 from megatron.training.tokenizer.multimodal_tokenizer import MultimodalTokenizer
+from transformers import Qwen2Tokenizer


 def build_tokenizer(args, **kwargs):
@@ -50,6 +51,11 @@ def build_tokenizer(args, **kwargs):
     elif args.tokenizer_type == 'Llama2Tokenizer':
         assert args.tokenizer_model is not None
         tokenizer = _Llama2Tokenizer(args.tokenizer_model)
+    elif args.tokenizer_type == 'Llama3Tokenizer':
+        assert args.tokenizer_model is not None
+        tokenizer = _Llama3Tokenizer(args.tokenizer_model)
+    elif args.tokenizer_type == 'QwenTokenizer':
+        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == 'TikTokenizer':
         assert args.tokenizer_model is not None
         assert args.tiktoken_pattern is not None
@@ -605,6 +611,94 @@ class _Llama2Tokenizer(_SentencePieceTokenizer):
     def additional_special_tokens_ids(self):
         return None

+
+class _Llama3Tokenizer(MegatronTokenizer):
+    """tiktoken tokenizer, adapted for Megatron from Llama 3"""
+    # https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py
+
+    def __init__(self, model_file):
+        super().__init__(model_file)
+        from pathlib import Path
+        import tiktoken
+        from tiktoken.load import load_tiktoken_bpe
+
+        tokenizer_path = model_file
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",  # end of turn
+        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)]
+        mergeable_ranks = load_tiktoken_bpe(tokenizer_path)
+        self.tokenizer = tiktoken.Encoding(
+            tokenizer_path,
+            pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
+            mergeable_ranks=mergeable_ranks,
+            special_tokens={token: len(mergeable_ranks) + i for i, token in enumerate(special_tokens)},
+        )
+        self.eod_id = self.tokenizer.encode("<|end_of_text|>", allowed_special="all")[0]
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+
+    @property
+    def vocab(self):
+        return self.tokenizer.encode
+
+    @property
+    def inv_vocab(self):
+        return self.tokenizer.encode
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.encode(token_ids)
+
+    @property
+    def eod(self):
+        return self.eod_id
+
+
+class _Qwen2Tokenizer(MegatronTokenizer):
+    def __init__(self, vocab_file, merge_file, extra_vocab_size=0):
+        super().__init__(vocab_file, merge_file)
+        self.tokenizer = Qwen2Tokenizer(vocab_file, merge_file)
+        self.extra_vocab_size = extra_vocab_size
+        self.tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>"))
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer.encoder) + self.extra_vocab_size
+
+    @property
+    def vocab(self):
+        return self.tokenizer.encoder
+
+    @property
+    def inv_vocab(self):
+        return self.tokenizer.decoder
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def eos_token(self):
+        return self.tokenizer.eos_token
+
+    @property
+    def pad_token_id(self):
+        return self.tokenizer.pad_token_id
+
+
 def reload_mergeable_ranks(path: str, max_vocab: Optional[int] = None) -> Dict[bytes, int]:
     """
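
Taken together, the three hunks let a training run select the new tokenizer with --tokenizer-type QwenTokenizer plus --vocab-file and --merge-file, exactly as examples/qwen/qwen1.5_14b.sh does. A hedged sketch of exercising the new wrapper directly; the weight paths are the ones used in that script and are only illustrative, and transformers must be installed:

from megatron.training.tokenizer.tokenizer import _Qwen2Tokenizer

vocab_file = "/public/home/wangxj/Downloads/model_weights/qwen1.5_14b/vocab.json"
merge_file = "/public/home/wangxj/Downloads/model_weights/qwen1.5_14b/merges.txt"

tok = _Qwen2Tokenizer(vocab_file, merge_file)   # wraps transformers.Qwen2Tokenizer
ids = tok.tokenize("hello world")               # list of token ids
print(ids)
print(tok.detokenize(ids))                      # decodes back to text
print(tok.vocab_size, tok.eod, tok.pad_token_id)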