#!/bin/bash
# run_qwen.sh
#
# Launches training.py through torchrun on a single node with 4 processes
# per node, passing the Qwen/Qwen2-7B argument set defined below.
#
# Usage: bash run_qwen.sh
#
# NOTE: training.py, config/fsdp_config.json and the alpaca_finetuning output
# directory are relative paths, so they resolve against the directory the
# script is invoked from.

# Abort on command failure, unset variables, or a failing pipeline stage.
set -euo pipefail

# Arguments forwarded verbatim to training.py. They are kept in an array so
# each group can carry a comment; the order matches the original command line.
train_args=(
  # Model and numeric precision.
  --model_name "Qwen/Qwen2-7B"
  --bf16

  # Training length and per-device batch sizes.
  --num_train_epochs 1
  --per_device_train_batch_size 48
  --per_device_eval_batch_size 64

  # Both evaluation and checkpoint saving strategies are set to "no".
  --eval_strategy "no"
  --save_strategy "no"

  # Optimizer and learning-rate schedule settings.
  --learning_rate 6e-6
  --weight_decay 0.05
  --warmup_ratio 0.1
  --lr_scheduler_type "cosine"

  # Logging: log every step, include the input-token count, and pass "none"
  # as the reporting target.
  --logging_steps 1
  --include_num_input_tokens_seen
  --report_to none

  # FSDP mode string and its JSON config file.
  --fsdp "full_shard auto_wrap"
  --fsdp_config config/fsdp_config.json

  --seed 42

  # NOTE(review): presumably toggles a Liger kernel code path inside
  # training.py -- confirm against that script's argument parser.
  --use_liger True

  --output_dir alpaca_finetuning
)

torchrun --nnodes=1 --nproc-per-node=4 training.py "${train_args[@]}"