resources:
  accelerators: A100-80GB:4
  disk_size: 1000

num_nodes: 1

file_mounts:
  /artifacts:
    name: skypilot-chatbot # Change to your own bucket
    store: gcs
    mode: MOUNT
  /data/alpaca-data-conversation.json: chatserver/data/example/alpaca-data-conversation.json
  # /llama:
  #   name: llama-ckpts # Change to the bucket that contains the LLaMA weights
  #   store: gcs
  #   mode: MOUNT

workdir: .

setup: |
  # Set up the environment
  conda create -n chatbot python=3.10 -y
  conda activate chatbot

  # Install PyTorch
  pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116

  # Install HuggingFace transformers pinned to the commit that adds LLaMA support
  cd ~
  git clone https://github.com/huggingface/transformers.git
  cd transformers
  git checkout 41a2f3529c6b56866c317031375ffd3e7b8bea01
  pip install .

  # Install fastchat
  cd ~/sky_workdir
  pip install -e .

  # Convert the raw LLaMA weights to the HuggingFace format once and cache the
  # result in the /artifacts bucket; later runs reuse the cached conversion.
  mkdir -p /artifacts/llama-hf/llama-${MODEL_SIZE}B
  if [ ! -f /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete ]; then
    mkdir -p ~/llama-${MODEL_SIZE}b
    gsutil -m rsync -r /llama/${MODEL_SIZE}b/ ~/llama-${MODEL_SIZE}b
    cd ~/transformers
    python src/transformers/models/llama/convert_llama_weights_to_hf.py \
      --input_dir $HOME/llama-${MODEL_SIZE}b \
      --model_size ${MODEL_SIZE}B \
      --output_dir ~/hf-output || exit 1
    mv ~/hf-output/tokenizer/* ~/hf-output/llama-${MODEL_SIZE}b
    gsutil -m rsync -r ~/hf-output/llama-${MODEL_SIZE}b/ /artifacts/llama-hf/llama-${MODEL_SIZE}B
    touch /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete
  else
    mkdir -p ~/hf-output/llama-${MODEL_SIZE}b
    gsutil -m cp -r /artifacts/llama-hf/llama-${MODEL_SIZE}B/* ~/hf-output/llama-${MODEL_SIZE}b
  fi

run: |
  conda activate chatbot
  SEQ_LEN=${SEQ_LEN:-512}
  echo "Training with seq_len=${SEQ_LEN} and model_size=${MODEL_SIZE}B"
  PER_DEVICE_BATCH_SIZE=$((2048 / $SEQ_LEN))

  NUM_NODES=`echo "$SKYPILOT_NODE_IPS" | wc -l`
  HOST_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`

  # Copy the latest checkpoint (if any) to local disk once, so resuming is
  # faster than reading it from the mounted bucket during training.
  mkdir -p ~/.checkpoints
  CKPT_PATH=/artifacts/chatbot/${MODEL_SIZE}b/alpaca-${SEQ_LEN}
  last_ckpt=$(ls ${CKPT_PATH} | grep -E '[0-9]+' | sort -n | tail -1)
  gsutil -m rsync -r ${CKPT_PATH}/${last_ckpt}/ ~/.checkpoints

  torchrun \
    --nnodes=$NUM_NODES \
    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
    --master_port=12375 \
    --master_addr=$HOST_ADDR \
    --node_rank=${SKYPILOT_NODE_RANK} \
    fastchat/train/train.py \
    --model_name_or_path ~/hf-output/llama-${MODEL_SIZE}b \
    --data_path /data/alpaca-data-conversation.json \
    --bf16 True \
    --output_dir $CKPT_PATH \
    --num_train_epochs 3 \
    --per_device_train_batch_size $PER_DEVICE_BATCH_SIZE \
    --per_device_eval_batch_size $PER_DEVICE_BATCH_SIZE \
    --gradient_accumulation_steps $((128 * 512 / $SEQ_LEN / $PER_DEVICE_BATCH_SIZE / $NUM_NODES / $SKYPILOT_NUM_GPUS_PER_NODE)) \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 2000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --tf32 True \
    --model_max_length ${SEQ_LEN}

envs:
  MODEL_SIZE: 7
  SEQ_LEN: 512
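
# Example launch command (a sketch; the cluster name "vicuna-train" and the
# file name "train.yaml" are illustrative, and the bucket names in file_mounts
# must be changed to ones you own before launching):
#
#   sky launch -c vicuna-train train.yaml --env MODEL_SIZE=7 --env SEQ_LEN=512
#
# MODEL_SIZE and SEQ_LEN default to the values in `envs:` above and can be
# overridden at launch time with `--env`.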