ModelZoo / GLM-4V_pytorch · Commits

Commit 1bfbcff0, authored Jun 13, 2024 by wanglch

Initial commit
Showing 20 changed files with 641 additions and 0 deletions (+641, -0); the commit touches 707 files in total.
swift-main/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_fsdp_sft.sh (+34, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh (+27, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_dp_sft.sh (+35, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_fsdp_sft.sh (+35, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh (+27, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_dp_sft.sh (+35, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_fsdp_sft.sh (+36, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh (+27, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_dp_sft.sh (+37, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh (+37, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh (+28, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/acc_lora_dp_sft.sh (+35, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/acc_lora_fsdp_sft.sh (+36, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh (+26, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/acc_lora_fsdp_sft.sh (+34, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh (+26, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/acc_full_fsdp_sft.sh (+35, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/acc_lora_fsdp_sft.sh (+31, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh (+26, -0)
swift-main/examples/pytorch/llm/scripts/torchacc/yi_34b_chat/acc_lora_fsdp_sft.sh (+34, -0)
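Each changed file below is a self-contained launch script. As a minimal sketch of how one of them might be run, assuming the ms-swift CLI and its dependencies are already installed and the tree keeps the layout above (the chosen script is just an example):

cd swift-main/examples/pytorch/llm
bash scripts/torchacc/chatglm3_6b/acc_lora_fsdp_sft.sh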
swift-main/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_fsdp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
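The script above runs only where the internal TorchAcc build is installed (per the note in its header). A small pre-flight check one might run first, assuming the build ships an importable torchacc Python module (an assumption; the internal package name may differ):

python3 - <<'EOF'
# Report whether a 'torchacc' module is importable (hypothetical module name).
import importlib.util
print("torchacc available:", importlib.util.find_spec("torchacc") is not None)
EOF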
swift-main/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 2 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_dp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
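This variant pins MASTER_PORT=27829 so it does not collide with other distributed jobs on the same host. If that port happens to be taken, one option (a sketch; any free-port scheme works) is to let the OS pick one:

# Ask the OS for a free TCP port and use it as the rendezvous port.
MASTER_PORT=$(python3 -c 'import socket; s = socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
export MASTER_PORT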
swift-main/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_fsdp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_dp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_fsdp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 24 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_dp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=21779 \
swift sft \
    --model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama3 \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
# export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama3 \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --dataset codefuse-python-en \
    --template_type llama3 \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/acc_lora_dp_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=23797 \
swift sft \
    --model_type qwen1half-14b-chat \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 8 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 1 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/acc_lora_fsdp_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
DEBUG_PREFIX=qwen15_14b
DEBUG_PATH=torchacc_debug/qwen15/
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

MASTER_PORT=23783 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_type qwen1half-14b-chat \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_type qwen1half-14b-chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/acc_lora_fsdp_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen1half-32b-chat \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 4 \
    --report_to 'none'
swift-main/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen1half-32b-chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 1 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1
swift-main/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/acc_full_fsdp_sft.sh
# Experimental environment: 4 * 8*A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export XLA_FLAGS='--xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.97

# Note: You need to set the correct MASTER_ADDR, MASTER_PORT and NODE_RANK for each node.
MASTER_ADDR=127.0.0.1 \
MASTER_PORT=12456 \
NODE_RANK=0 \
NNODES=4 \
NPROC_PER_NODE=8 \
swift sft \
    --model_type qwen-72b-chat \
    --model_layer_cls_name QWenBlock \
    --dataset codefuse-python-en \
    --sft_type full \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 1024 \
    --batch_size 1 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --eval_steps 200 \
    --save_steps 200 \
    --logging_steps 100 \
    --metric_warmup_step 0.1 \
    --report_to 'none' \
    --fsdp_num 32
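As the header note says, MASTER_ADDR, MASTER_PORT and NODE_RANK must be set correctly on each of the four nodes. A sketch of the per-node invocation, assuming node 0 is reachable at 10.0.0.1 (a placeholder address) and RANK_OF_THIS_NODE is supplied per machine (a hypothetical variable):

# Run on every node, with RANK_OF_THIS_NODE set to 0, 1, 2 or 3:
MASTER_ADDR=10.0.0.1 \
MASTER_PORT=12456 \
NODE_RANK=$RANK_OF_THIS_NODE \
NNODES=4 \
NPROC_PER_NODE=8 \
swift sft ...   # same arguments as in the script above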
swift-main/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/acc_lora_fsdp_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen-72b-chat \
    --model_layer_cls_name QWenBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output_qwen_72b \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --eval_steps 200 \
    --save_steps 200 \
    --logging_steps 100 \
    --metric_warmup_step 0.1 \
    --report_to 'none' \
    --fsdp_num 4
swift-main/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_id_or_path qwen/Qwen-72B-Chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 1 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1
swift-main/examples/pytorch/llm/scripts/torchacc/yi_34b_chat/acc_lora_fsdp_sft.sh
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type yi-34b-chat \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 4 \
    --report_to 'none'