delete error files

3392f187 · zhaoying1 · e9bec754 · e9bec754 · e9bec754 · e9bec754
Commit 3392f187 authored Sep 08, 2023 by zhaoying1
7 changed files
--- a/ptuning/logs/6B_ds_ft_bs32_accum1_4cards_zero3_5e-5.jpg
+++ b/ptuning/logs/6B_ds_ft_bs32_accum1_4cards_zero3_5e-5.jpg
--- a/ptuning/logs/6B_ds_pt_bs16_accum1_4cards_zero2_5e-3.jpg
+++ b/ptuning/logs/6B_ds_pt_bs16_accum1_4cards_zero2_5e-3.jpg
--- a/ptuning/logs/pretrain.jpeg
+++ b/ptuning/logs/pretrain.jpeg
--- a/ptuning/mpirun_slurm/.gitkeep
+++ b/ptuning/mpirun_slurm/.gitkeep
--- a/ptuning/mpirun_slurm/run.sh
+++ b/ptuning/mpirun_slurm/run.sh
-#/bin/bash
-mkdir -p logs
-#rm -rf log/*
-mkdir -p pt_output
-mkdir -p hostfile
-sbatch run_train.sh
--- a/ptuning/mpirun_slurm/run_train.sh
+++ b/ptuning/mpirun_slurm/run_train.sh
-#!/bin/bash
-#SBATCH -p kshdnormal01
-#SBATCH -N 4
-#SBATCH --cpus-per-task=1
-#SBATCH --ntasks-per-node=32
-#SBATCH --mem 100G
-#SBATCH --gres=dcu:4
-#SBATCH -J chatglm
-#SBATCH -o logs/pt-%j.out
-#SBATCH -e logs/pt-%j.err
-ulimit -u 200000
-export OMP_NUM_THREADS=1
-export NCCL_DEBUG=INFO
-export MIOPEN_FIND_MODE=3
-export HSA_FORCE_FINE_GRAIN_PCIE=1
-export MIOPEN_COMPILE_PARALLEL_LEVEL=1
-export NCCL_PLUGIN_P2P=ucx
-export NCCL_SOCKET_IFNAME=ib0
-export NCCL_P2P_LEVEL=5
-export NCCL_NET_PLUGIN=none
-unset RCCL_NCHANNELS
-unset NCCL_NET_GDR_LEVEL
-rm -rf ./hostfile/*
-echo "START TIME: $(date)"
-hostfile=./hostfile/$SLURM_JOB_ID
-scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
-for i in `cat $hostfile`
-do
-    echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
-done
-np=$(cat $hostfile|sort|uniq |wc -l)
-np=$(($np*4))
-nodename=$(cat $hostfile |sed -n "1p")
-dist_url=`echo $nodename | awk '{print $1}'`
-echo ${dist_url}
-mpirun -np $np --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run_train_single.sh $dist_url
--- a/ptuning/mpirun_slurm/run_train_single.sh
+++ b/ptuning/mpirun_slurm/run_train_single.sh
-#!/bin/bash
-export HSA_FORCE_FINE_GRAIN_PCIE=1
-export MIOPEN_FIND_MODE=3
-export MIOPEN_COMPILE_PARALLEL_LEVEL=1
-export NCCL_PLUGIN_P2P=ucx
-export RCCL_NCHANNELS=2
-export NCCL_SOCKET_IFNAME=ib0
-export NCCL_P2P_LEVEL=5
-export NCCL_IB_HCA=mlx5_0
-export NCCL_DEBUG=INFO
-export NCCL_NET_GDR_LEVEL=SYS
-export NCCL_NET_PLUGIN=none
-unset RCCL_NCHANNELS
-unset NCCL_NET_GDR_LEVEL
-lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-echo "LRANK===============================$lrank"
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-export HIP_VISIBLE_DEVICES=0,1,2,3
-LR=1e-5
-APP="python3 /public/home/zhaoying1/work/chatglm-main/ptuning/main-v1.py \
-    --deepspeed /public/home/zhaoying1/work/chatglm-main/ptuning/deepspeed.json \
-    --do_train \
-    --train_file /public/home/zhaoying1/work/chatglm-main/ptuning/sugon_md_word_faq.json \
-    --prompt_column prompt \
-    --response_column response \
-    --model_name_or_path /public/home/zhaoying1/work/model_scope/chatglm-6b \
-    --output_dir ./pt_output/pretrain \
-    --overwrite_output_dir \
-    --max_source_length 3 \
-    --max_target_length 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --predict_with_generate \
-    --max_steps 2000 \
-    --logging_steps 5 \
-    --save_steps 1000 \
-    --learning_rate $LR \
-    --fp16 \
-    --local_rank $lrank "
-case ${lrank} in
-[0])
-  export HIP_VISIBLE_DEVICES=0,1,2,3
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  numactl --cpunodebind=0 --membind=0 ${APP}
-  ;;
-[1])
-  export HIP_VISIBLE_DEVICES=0,1,2,3
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  numactl --cpunodebind=1 --membind=1 ${APP}
-  ;;
-[2])
-  export HIP_VISIBLE_DEVICES=0,1,2,3
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  numactl --cpunodebind=2 --membind=2 ${APP}
-  ;;
-[3])
-  export HIP_VISIBLE_DEVICES=0,1,2,3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  numactl --cpunodebind=3 --membind=3 ${APP}
-  ;;
-esac