Commit 1bfbcff0 authored by wanglch

Initial commit
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
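# Notes on the XLA settings above (brief; see the torch_xla/XLA docs for authoritative detail):
# - xla_multiheap_size_constraint_per_heap caps each XLA heap at 4.5 GiB (4831838208 bytes);
# - xla_disable_hlo_passes skips the listed collective-combiner and rematerialization passes;
# - XLA_IR_SHAPE_CACHE_SIZE sizes the lazy-tensor IR shape cache;
# - XLA_ALLOCATOR_FRACTION=0.95 lets the XLA allocator claim 95% of GPU memory;
# - XLA_EXPERIMENTAL enables the dynamic-shape ops nonzero and masked_select under XLA.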
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--model_layer_cls_name BaichuanLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
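# native swift baseline (no TorchAcc), for comparison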
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 2 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--model_layer_cls_name GLMBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--model_layer_cls_name GLMBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
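# native swift baseline (no TorchAcc), for comparison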
# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
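# torchacc dp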
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
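# torchacc fsdp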
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 24 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
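# native swift baseline (no TorchAcc), for comparison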
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
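# torchacc dp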
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
export XLA_COORDINATOR_PORT=12457
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=21779 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
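# torchacc fsdp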
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
# export XLA_COORDINATOR_PORT=12457
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'

# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
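# native swift baseline (no TorchAcc), for comparison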
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
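# torchacc dp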
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=23797 \
swift sft \
--model_type qwen1half-14b-chat \
--model_layer_cls_name Qwen2DecoderLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 8 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 1 \
--report_to 'none'

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
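# torchacc fsdp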
DEBUG_PREFIX=qwen15_14b
DEBUG_PATH=torchacc_debug/qwen15/
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
MASTER_PORT=23783 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_type qwen1half-14b-chat \
--model_layer_cls_name Qwen2DecoderLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
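# native swift baseline (no TorchAcc), for comparison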
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_type qwen1half-14b-chat \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
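# torchacc fsdp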
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model_type qwen1half-32b-chat \
--model_layer_cls_name Qwen2DecoderLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 4 \
--report_to 'none'

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
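# native swift baseline (no TorchAcc), for comparison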
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model_type qwen1half-32b-chat \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 1 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 4 nodes * 8 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
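# torchacc fsdp, multi-node full-parameter SFT (fsdp_num 32 across 4 nodes)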
export USE_TORCHACC=1
export XLA_FLAGS='--xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.97
# Note: You need to set the correct MASTER_ADDR, MASTER_PORT and NODE_RANK for each node.
MASTER_ADDR=127.0.0.1 \
MASTER_PORT=12456 \
NODE_RANK=0 \
NNODES=4 \
NPROC_PER_NODE=8 \
swift sft \
--model_type qwen-72b-chat \
--model_layer_cls_name QWenBlock \
--dataset codefuse-python-en \
--sft_type full \
--output_dir output \
--num_train_epochs 1 \
--max_length 1024 \
--batch_size 1 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--eval_steps 200 \
--save_steps 200 \
--logging_steps 100 \
--metric_warmup_step 0.1 \
--report_to 'none' \
--fsdp_num 32

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
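# torchacc fsdp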
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model_type qwen-72b-chat \
--model_layer_cls_name QWenBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output_qwen_72b \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--eval_steps 200 \
--save_steps 200 \
--logging_steps 100 \
--metric_warmup_step 0.1 \
--report_to 'none' \
--fsdp_num 4

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
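# native swift baseline (no TorchAcc), for comparison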
# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model_id_or_path qwen/Qwen-72B-Chat \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 1 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'

# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
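# torchacc fsdp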
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
--model_type yi-34b-chat \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 4 \
--report_to 'none'