"...googletest-src/googletest/samples/sample2_unittest.cc" did not exist on "395d2ce606314a6729939084e5f492f37cd2ff13"
train_sft.sh 859 Bytes
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Pick the N GPUs with the least memory currently in use and expose only
# those through CUDA_VISIBLE_DEVICES.
#
# Globals:   CUDA_VISIBLE_DEVICES (exported; comma-separated GPU row numbers)
# Arguments: $1 - number of GPUs to keep (default 9999, i.e. effectively all)
# Outputs:   per-GPU memory table and the resulting setting on stdout;
#            diagnostics on stderr
# Returns:   0 on success; 1 if nvidia-smi fails or lists no GPUs, in which
#            case CUDA_VISIBLE_DEVICES is left untouched
set_n_least_used_CUDA_VISIBLE_DEVICES() {
    local n=${1:-"9999"}
    local raw table ids

    # Assigned separately from `local` so the exit status of nvidia-smi is
    # not masked by the declaration.
    if ! raw=$(nvidia-smi --query-gpu=memory.used --format=csv); then
        echo "nvidia-smi failed; CUDA_VISIBLE_DEVICES left unchanged" >&2
        return 1
    fi

    # Drop the CSV header, then number the remaining rows from 0. The row
    # number is used as the GPU id (assumes nvidia-smi lists GPUs in index
    # order).
    table=$(printf '%s\n' "$raw" | tail -n +2 | nl -v 0)
    if [ -z "$table" ]; then
        echo "nvidia-smi reported no GPUs; CUDA_VISIBLE_DEVICES left unchanged" >&2
        return 1
    fi

    echo "GPU Memory Usage:"
    # Printed on stdout rather than /dev/tty so the function also works
    # without a controlling terminal (nohup, CI, batch schedulers).
    printf '%s\n' "$table"

    # Sort rows by used memory (general-numeric, so the " MiB" suffix is
    # ignored), keep the ids of the first n rows, and join them with commas.
    ids=$(printf '%s\n' "$table" |
        sort -g -k 2 |
        awk '{print $1}' |
        head -n "$n" |
        paste -sd, -)

    export CUDA_VISIBLE_DEVICES=$ids
    echo "Now CUDA_VISIBLE_DEVICES is set to:"
    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}

# Restrict this run to the 4 GPUs that currently have the least memory in use.
set_n_least_used_CUDA_VISIBLE_DEVICES 4

# Launch train_sft.py through torchrun on a single node (--standalone) with
# 4 worker processes, matching the 4 GPUs selected above.
# The /path/to/... values are placeholders; replace them before running.
torchrun --standalone --nproc_per_node=4 train_sft.py \
    --pretrain "/path/to/LLaMa-7B/" \
    --model 'llama' \
    --strategy colossalai_zero2 \
    --save_path /path/to/Coati-7B \
    --dataset /path/to/data.json \
    --batch_size 4 \
    --accumulation_steps 8 \
    --lr 2e-5 \
    --max_datasets_size 512 \
    --max_epochs 1