resources:
  accelerators: A100-80GB:4
  disk_size: 1000

num_nodes: 1

file_mounts:
  /artifacts:
    name: skypilot-chatbot # Change to your own bucket
    store: gcs
    mode: MOUNT
  /data/alpaca-data-conversation.json: chatserver/data/example/alpaca-data-conversation.json
  # /llama:
  #   name: llama-ckpts # Change to the bucket that contains the LLaMA weights
  #   store: gcs
  #   mode: MOUNT

workdir: .

setup: |
  # Set up the environment
  conda create -n chatbot python=3.10 -y
  conda activate chatbot

  # Install PyTorch
  pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116

  # Install HuggingFace transformers pinned to the commit that adds LLaMA support
  cd ~
  git clone https://github.com/huggingface/transformers.git
  cd transformers
  git checkout 41a2f3529c6b56866c317031375ffd3e7b8bea01
  pip install .

  # Install fastchat
  cd ~/sky_workdir
  pip install -e .

  # Convert the raw LLaMA weights to the HuggingFace format once and cache the
  # result in the /artifacts bucket; later runs reuse the cached conversion.
  mkdir -p /artifacts/llama-hf/llama-${MODEL_SIZE}B
  if [ ! -f /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete ]; then
    mkdir -p ~/llama-${MODEL_SIZE}b
    gsutil -m rsync -r /llama/${MODEL_SIZE}b/ ~/llama-${MODEL_SIZE}b
    cd ~/transformers
    python src/transformers/models/llama/convert_llama_weights_to_hf.py \
      --input_dir $HOME/llama-${MODEL_SIZE}b \
      --model_size ${MODEL_SIZE}B \
      --output_dir ~/hf-output || exit 1
    mv ~/hf-output/tokenizer/* ~/hf-output/llama-${MODEL_SIZE}b
    gsutil -m rsync -r ~/hf-output/llama-${MODEL_SIZE}b/ /artifacts/llama-hf/llama-${MODEL_SIZE}B
    touch /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete
  else
    mkdir -p ~/hf-output/llama-${MODEL_SIZE}b
    gsutil -m cp -r /artifacts/llama-hf/llama-${MODEL_SIZE}B/* ~/hf-output/llama-${MODEL_SIZE}b
  fi

run: |
  conda activate chatbot
  SEQ_LEN=${SEQ_LEN:-512}
  echo "Training with seq_len=${SEQ_LEN} and model_size=${MODEL_SIZE}B"
  PER_DEVICE_BATCH_SIZE=$((2048 / $SEQ_LEN))

  NUM_NODES=`echo "$SKYPILOT_NODE_IPS" | wc -l`
  HOST_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`

  # Copy the latest checkpoint (if any) to local disk once, so resuming is
  # faster than reading it from the mounted bucket during training.
  mkdir -p ~/.checkpoints
  CKPT_PATH=/artifacts/chatbot/${MODEL_SIZE}b/alpaca-${SEQ_LEN}
  last_ckpt=$(ls ${CKPT_PATH} | grep -E '[0-9]+' | sort -n | tail -1)
  gsutil -m rsync -r ${CKPT_PATH}/${last_ckpt}/ ~/.checkpoints

  torchrun \
    --nnodes=$NUM_NODES \
    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
    --master_port=12375 \
    --master_addr=$HOST_ADDR \
    --node_rank=${SKYPILOT_NODE_RANK} \
    fastchat/train/train.py \
    --model_name_or_path ~/hf-output/llama-${MODEL_SIZE}b \
    --data_path /data/alpaca-data-conversation.json \
    --bf16 True \
    --output_dir $CKPT_PATH \
    --num_train_epochs 3 \
    --per_device_train_batch_size $PER_DEVICE_BATCH_SIZE \
    --per_device_eval_batch_size $PER_DEVICE_BATCH_SIZE \
    --gradient_accumulation_steps $((128 * 512 / $SEQ_LEN / $PER_DEVICE_BATCH_SIZE / $NUM_NODES / $SKYPILOT_NUM_GPUS_PER_NODE)) \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 2000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --tf32 True \
    --model_max_length ${SEQ_LEN}

envs:
  MODEL_SIZE: 7
  SEQ_LEN: 512
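
# Example launch command (a sketch; the cluster name "vicuna-train" and the
# file name "train.yaml" are illustrative, and the bucket names in file_mounts
# must be changed to ones you own before launching):
#
#   sky launch -c vicuna-train train.yaml --env MODEL_SIZE=7 --env SEQ_LEN=512
#
# MODEL_SIZE and SEQ_LEN default to the values in `envs:` above and can be
# overridden at launch time with `--env`.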