#!/bin/bash
set -eux
# Multi-node environment variables
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
source /opt/dtk/env.sh
# Transformer Engine's GEMM calls require the hipBLASLt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH 
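# Optional sanity check (illustrative, not part of the original workflow):
# confirm the dynamic loader can resolve a hipBLASLt build before training.
ldconfig -p | grep -i hipblaslt || echo "warning: hipblaslt not in loader cache" >&2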

#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=8
export NCCL_MIN_NCHANNELS=20
export NCCL_MIN_P2P_NCHANNELS=8
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1

export NCCL_IB_HCA=mlx5_1,mlx5_2
#export NCCL_SOCKET_IFNAME=ibs8
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
#export NCCL_DEBUG=info
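# Optional check (illustrative, not part of the original script): list the
# InfiniBand HCAs and port states to confirm the devices named above exist.
# Requires the ibverbs utilities:
# ibv_devinfo | grep -E 'hca_id|state'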

# Model and dataset parameters
MODEL="/data/model_weights/llama2_7b_nemo/llama2-7b.nemo"
TRAIN_DS="[/data/datasets/mlperf_llama/databricks-dolly-15k/training.jsonl]"
VALID_DS="[/data/datasets/mlperf_llama/databricks-dolly-15k/validation.jsonl]"
TEST_DS="[/data/datasets/mlperf_llama/databricks-dolly-15k/test.jsonl]"
VALID_NAMES="[databricks-dolly-15k]"
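
# A hedged sketch of the record format the files above are expected to hold
# (assuming NeMo's default SFT keys "input" and "output"; verify against the
# preprocessed dolly files):
#   {"input": "What is Databricks?", "output": "Databricks is ..."}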

# Sampling proportions for the fine-tuning datasets
# TRAIN_DS="[/path/to/dataset_1.jsonl,/path/to/dataset_2.jsonl]"
# CONCAT_SAMPLING_PROBS="[0.3,0.7]"  # use "[1]" when there is only one dataset
CONCAT_SAMPLING_PROBS="[1]"
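
# Optional sanity check (an illustrative sketch, not in the original pipeline):
# abort early if the sampling probabilities do not sum to 1.
python3 - "${CONCAT_SAMPLING_PROBS}" <<'EOF'
import ast, sys
probs = ast.literal_eval(sys.argv[1])
assert abs(sum(probs) - 1.0) < 1e-6, "concat sampling probabilities must sum to 1"
EOF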

# The following environment variable may need to be exported
export LD_PRELOAD=/usr/local/lib/python3.10/site-packages/transformer_engine.libs/libgalaxyhip-8e217ef3.so.5.2.24472.1059-0a6afed7
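# Optional guard (illustrative; the pinned filename above is install-specific):
# fail fast if the preload target is missing on this machine.
[ -f "${LD_PRELOAD}" ] || { echo "LD_PRELOAD target not found: ${LD_PRELOAD}" >&2; exit 1; }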
# Run the training script
torchrun --nproc_per_node 8 \
   /workspace/nemo_main/NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
   trainer.precision=bf16 \
   trainer.devices=8 \
   trainer.num_nodes=1 \
   trainer.val_check_interval=15 \
   trainer.max_steps=300 \
   model.restore_from_path=${MODEL} \
   model.micro_batch_size=1 \
   model.global_batch_size=60 \
   model.tensor_model_parallel_size=2 \
   model.pipeline_model_parallel_size=2 \
   model.megatron_amp_O2=True \
   model.sequence_parallel=True \
   model.activations_checkpoint_granularity=selective \
   model.activations_checkpoint_method=uniform \
   model.optim.name=fused_adam \
   model.optim.lr=5e-6 \
   model.answer_only_loss=True \
   model.peft.peft_scheme=lora \
   model.data.train_ds.file_names=${TRAIN_DS} \
   model.data.validation_ds.file_names=${VALID_DS} \
   model.data.test_ds.file_names=${TEST_DS} \
   model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \
   model.data.train_ds.max_seq_length=4096 \
   model.data.validation_ds.max_seq_length=4096 \
   model.data.train_ds.num_workers=0 \
   model.data.validation_ds.num_workers=0 \
   model.data.test_ds.num_workers=0 \
   ++cluster_type=BCP
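
# Parallelism bookkeeping for the settings above (a worked check, assuming
# standard Megatron semantics): data-parallel size = devices / (TP * PP)
# = 8 / (2 * 2) = 2, so gradient accumulation
# = global_batch_size / (micro_batch_size * DP) = 60 / (1 * 2) = 30.
#
# A hedged multi-node launch sketch (NNODES, NODE_RANK and MASTER_ADDR are
# placeholders to supply per node; trainer.num_nodes must be raised to match):
# torchrun --nnodes ${NNODES} --node_rank ${NODE_RANK} \
#    --master_addr ${MASTER_ADDR} --master_port 29500 \
#    --nproc_per_node 8 ... (same script and arguments as above)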