# Launch script: LoRA fine-tuning of the "7B" parameter model with NeMo's
# megatron_gpt_finetuning.py, started through torchrun with 8 processes on a
# single node (tensor parallel 2 x pipeline parallel 2).
#
# Requires bash (uses `source`). Paths below are site-specific; adjust MODEL,
# the dataset variables and LD_PRELOAD for your installation.

# -e: stop on the first failing command, -u: treat unset variables as errors,
# -x: trace every command so the full torchrun invocation lands in the log.
set -eux

# Multi-node environment variables
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5

source /opt/dtk/env.sh

# Transformer Engine GEMM calls need the hipblaslt library on the loader path
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH

#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20

export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=8
export NCCL_MIN_NCHANNELS=20
export NCCL_MIN_P2P_NCHANNELS=8
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
#export NCCL_SOCKET_IFNAME=ibs8
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
#export NCCL_DEBUG=info

# Model and dataset parameters.
# The dataset values are bracketed list literals that are handed to the
# training script verbatim, so they must stay quoted wherever they are used.
MODEL="/data/model_weights/llama2_7b_nemo/llama2-7b.nemo"
TRAIN_DS="[/data/datasets/mlperf_llama/databricks-dolly-15k/training.jsonl]"
VALID_DS="[/data/datasets/mlperf_llama/databricks-dolly-15k/validation.jsonl]"
TEST_DS="[/data/datasets/mlperf_llama/databricks-dolly-15k/test.jsonl]"
# NOTE(review): VALID_NAMES is assigned but not passed to the command below;
# confirm whether it should be forwarded or can be dropped.
VALID_NAMES="[databricks-dolly-15k]"

# Sampling ratio of each fine-tuning dataset, e.g. for two datasets:
# TRAIN_DS="[/path/to/dataset_1.jsonl,/path/to/dataset_2.jsonl]"
# CONCAT_SAMPLING_PROBS="[0.3,0.7]"
# With a single dataset, set it to "[1]".
CONCAT_SAMPLING_PROBS="[1]"

# Environment variable that may need to be set on this platform
export LD_PRELOAD=/usr/local/lib/python3.10/site-packages/transformer_engine.libs/libgalaxyhip-8e217ef3.so.5.2.24472.1059-0a6afed7

# Run the training script.
# Every expansion is double-quoted: values such as "[1]" are glob bracket
# patterns and would otherwise be subject to pathname expansion and splitting.
torchrun --nproc_per_node 8 \
  ./NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
  trainer.precision=bf16 \
  trainer.devices=8 \
  trainer.num_nodes=1 \
  trainer.val_check_interval=15 \
  trainer.max_steps=300 \
  model.restore_from_path="${MODEL}" \
  model.micro_batch_size=1 \
  model.global_batch_size=60 \
  model.tensor_model_parallel_size=2 \
  model.pipeline_model_parallel_size=2 \
  model.megatron_amp_O2=True \
  model.sequence_parallel=True \
  model.activations_checkpoint_granularity=selective \
  model.activations_checkpoint_method=uniform \
  model.optim.name=fused_adam \
  model.optim.lr=5e-6 \
  model.answer_only_loss=True \
  model.peft.peft_scheme=lora \
  model.data.train_ds.file_names="${TRAIN_DS}" \
  model.data.validation_ds.file_names="${VALID_DS}" \
  model.data.test_ds.file_names="${TEST_DS}" \
  model.data.train_ds.concat_sampling_probabilities="${CONCAT_SAMPLING_PROBS}" \
  model.data.train_ds.max_seq_length=4096 \
  model.data.validation_ds.max_seq_length=4096 \
  model.data.train_ds.num_workers=0 \
  model.data.validation_ds.num_workers=0 \
  model.data.test_ds.num_workers=0 \
  ++cluster_type=BCP