Support GLM-4/GLM-4-0414/GLM-Z1

581d366d · chenych · 428c5813 · 581d366d · 581d366d · 581d366d
Commit 581d366d authored Apr 15, 2025 by chenych
20 changed files
--- a/docker/docker-rocm/Dockerfile
+++ b/docker/docker-rocm/Dockerfile
@@ -12,8 +12,13 @@ ARG INSTALL_DEEPSPEED=false
 ARG INSTALL_FLASHATTN=false
 ARG INSTALL_LIGER_KERNEL=false
 ARG INSTALL_HQQ=false
+ARG INSTALL_PYTORCH=true
 ARG PIP_INDEX=https://pypi.org/simple
 ARG HTTP_PROXY=
+ARG PYTORCH_INDEX=https://download.pytorch.org/whl/nightly/rocm6.3
+
+# Use Bash instead of default /bin/sh
+SHELL ["/bin/bash", "-c"]

 # Set the working directory
 WORKDIR /app
@@ -62,6 +67,13 @@ RUN EXTRA_PACKAGES="metrics"; \
        pip install -e ".[$EXTRA_PACKAGES]"; \
    fi

+# Reinstall PyTorch from PYTORCH_INDEX when INSTALL_PYTORCH is "true" (the default index is the nightly ROCm 6.3 build).
+# --pre permits pre-release wheels; --no-cache-dir keeps pip's download cache out of the image layer.
+RUN if [ "$INSTALL_PYTORCH" == "true" ]; then \
+        pip uninstall -y torch torchvision torchaudio && \
+        pip install --no-cache-dir --pre torch torchvision torchaudio --index-url "$PYTORCH_INDEX"; \
+    fi
+
 # Rebuild flash attention
 RUN pip uninstall -y transformer-engine flash-attn && \
    if [ "$INSTALL_FLASHATTN" == "true" ]; then \

--- a/docker/docker-rocm/docker-compose.yml
+++ b/docker/docker-rocm/docker-compose.yml
@@ -9,8 +9,10 @@ services:
        INSTALL_DEEPSPEED: "false"
        INSTALL_FLASHATTN: "false"
        INSTALL_LIGER_KERNEL: "false"
+        INSTALL_PYTORCH: "true"
        INSTALL_HQQ: "false"
        PIP_INDEX: https://pypi.org/simple
+        PYTORCH_INDEX: https://download.pytorch.org/whl/nightly/rocm6.3
    container_name: llamafactory
    volumes:
      - ../../hf_cache:/root/.cache/huggingface

--- a/examples/extras/adam_mini/qwen2_full_sft.yaml
+++ b/examples/extras/adam_mini/qwen2_full_sft.yaml
@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/qwen2-1_5b/full/sft
@@ -22,6 +23,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/apollo/llama3_full_sft.yaml
+++ b/examples/extras/apollo/llama3_full_sft.yaml
@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/full/sft
@@ -27,6 +28,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/full/sft
@@ -27,6 +28,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -24,6 +25,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -19,6 +19,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/full/sft
@@ -26,6 +27,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b-pro/freeze/sft
@@ -24,6 +25,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -24,6 +25,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b-mod/full/sft
@@ -22,6 +23,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/extras/nlg_eval/llama3_lora_predict.yaml
+++ b/examples/extras/nlg_eval/llama3_lora_predict.yaml
@@ -18,10 +18,12 @@ cutoff_len: 2048
 max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/predict
 overwrite_output_dir: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### eval
 per_device_eval_batch_size: 1

--- a/examples/extras/pissa/llama3_lora_sft.yaml
+++ b/examples/extras/pissa/llama3_lora_sft.yaml
@@ -19,6 +19,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -26,6 +27,8 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_full/llama3_full_sft.yaml
+++ b/examples/train_full/llama3_full_sft.yaml
@@ -24,6 +24,7 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_full/qwen2vl_full_sft.yaml
+++ b/examples/train_full/qwen2vl_full_sft.yaml
@@ -8,10 +8,10 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: full
-freeze_vision_tower: true  # choices: [true, false]
-freeze_multi_modal_projector: true  # choices: [true, false]
-freeze_language_model: false  # choices: [true, false]
-deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+freeze_vision_tower: true
+freeze_multi_modal_projector: true
+freeze_language_model: false
+deepspeed: examples/deepspeed/ds_z3_config.json

 ### dataset
 dataset: mllm_demo,identity,alpaca_en_demo
@@ -29,6 +29,7 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
@@ -27,6 +27,7 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_lora/llama3_lora_kto.yaml
+++ b/examples/train_lora/llama3_lora_kto.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/kto
@@ -24,6 +25,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_lora/llama3_lora_ppo.yaml
+++ b/examples/train_lora/llama3_lora_ppo.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/ppo
@@ -24,6 +25,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/llama3_lora_pretrain.yaml
@@ -24,6 +24,7 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/llama3_lora_reward.yaml
@@ -25,6 +25,7 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1

--- a/examples/train_lora/llama3_lora_sft.yaml
+++ b/examples/train_lora/llama3_lora_sft.yaml
@@ -25,6 +25,7 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

 ### train
 per_device_train_batch_size: 1