# train_config_8M.yaml (4.76 KB)
# Commit "first" by raojy.
# Trainer implementation requested for this run; the name is resolved by the
# consuming code, which is not part of this file.
trainer_type: fsdp2_trainer
# Dataset settings: where the data comes from, which processor prepares it,
# and how sequences and videos are handled.
dataset_config:
  # Dataset source and loader selection.
  dataset_type: qwen3_vl_iterable
  dataset_format: yaml
  dataset_path: training/qwen3_vl/data_8M.yaml
  datasets: null
  eval_dataset_path: null
  shuffle: true
  extra_kwargs: {}

  # Processor paired with the dataset.
  processor_config:
    processor_type: qwen3_vl
    processor_name: Qwen/Qwen3-VL-8B-Instruct

  # Object storage. The value is the string 'none', not a YAML null.
  object_storage: 'none'
  bucket_name: null

  # Sequence packing is switched off; strategy and length are still listed.
  packing: false
  packing_strategy: first_fit
  packing_length: 40000

  # Length handling. No explicit max_length is set.
  max_length: null
  filter_overlong: true
  filter_overlong_workers: 8

  # Video decoding and frame sampling.
  video_backend: qwen_vl_utils
  video_sampling_strategy: fps
  fps: 1
  frame_num: 64
  video_max_frames: 512
  video_max_pixels: 50176
# Trainer arguments. The key names mirror Hugging Face `TrainingArguments`,
# followed by extra switches at the end of the block (use_muon, use_rmpad,
# fsdp2, ...). Presumably these are forwarded to the trainer selected by
# `trainer_type` - confirm against the consuming code.
trainer_args:
  output_dir: ./results/qwen3_vl/sensenova_si_8M
  overwrite_output_dir: false
  do_train: false
  do_eval: false
  do_predict: false
  eval_strategy: 'no'
  prediction_loss_only: false
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  per_gpu_train_batch_size: null
  per_gpu_eval_batch_size: null
  gradient_accumulation_steps: 1
  eval_accumulation_steps: null
  eval_delay: 0
  torch_empty_cache_steps: null
  learning_rate: 1.0e-05
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0
  # NOTE(review): both num_train_epochs and max_steps are set. In Hugging Face
  # TrainingArguments a positive max_steps takes precedence over
  # num_train_epochs - confirm this trainer behaves the same way.
  num_train_epochs: 1
  max_steps: 1000
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.01
  warmup_steps: 0
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  # NOTE(review): logging_dir is under ./output while output_dir is under
  # ./results - confirm the split is intentional.
  logging_dir: ./output/qwen3_vl_training/runs
  logging_strategy: steps
  logging_first_step: false
  logging_steps: 1
  logging_nan_inf_filter: true
  save_strategy: steps
  save_steps: 200
  save_total_limit: 1
  save_safetensors: true
  save_on_each_node: false
  save_only_model: false
  restore_callback_states_from_checkpoint: false
  no_cuda: false
  use_cpu: false
  use_mps_device: false
  seed: 42
  data_seed: null
  jit_mode_eval: false
  bf16: true
  fp16: false
  fp16_opt_level: O1
  half_precision_backend: auto
  bf16_full_eval: false
  fp16_full_eval: false
  tf32: null
  local_rank: 0
  ddp_backend: null
  tpu_num_cores: null
  tpu_metrics_debug: false
  debug: []
  dataloader_drop_last: false
  eval_steps: null
  dataloader_num_workers: 0
  dataloader_prefetch_factor: null
  past_index: -1
  run_name: video_debug
  disable_tqdm: false
  remove_unused_columns: true
  label_names: null
  load_best_model_at_end: false
  metric_for_best_model: null
  greater_is_better: null
  ignore_data_skip: false
  fsdp: []
  fsdp_min_num_params: 0
  fsdp_config:
    transformer_layer_cls_to_wrap:
    - Qwen3VLTextDecoderLayer
    reshard_after_forward: false
    min_num_params: 0
    xla: false
    xla_fsdp_v2: false
    xla_fsdp_grad_ckpt: false
  fsdp_transformer_layer_cls_to_wrap: null
  accelerator_config:
    split_batches: false
    dispatch_batches: null
    even_batches: true
    use_seedable_sampler: true
    non_blocking: false
    gradient_accumulation_kwargs: null
  parallelism_config: null
  deepspeed: null
  label_smoothing_factor: 0.0
  optim: adamw_torch_fused
  optim_args: null
  adafactor: false
  group_by_length: false
  length_column_name: length
  report_to: []
  project: huggingface
  trackio_space_id: trackio
  ddp_find_unused_parameters: null
  ddp_bucket_cap_mb: null
  ddp_broadcast_buffers: null
  dataloader_pin_memory: true
  dataloader_persistent_workers: false
  skip_memory_metrics: true
  use_legacy_prediction_loop: false
  push_to_hub: false
  resume_from_checkpoint: null
  hub_model_id: null
  hub_strategy: every_save
  # Was the literal placeholder `<HUB_TOKEN>`, which is what a serialized
  # TrainingArguments dump writes in place of token fields. Loading that back
  # would hand the placeholder string to the Hub client as a credential, so
  # the field is left unset. Supply a real token outside this file (for
  # example through the environment) if pushing to the Hub is ever enabled.
  hub_token: null
  hub_private_repo: null
  hub_always_push: false
  hub_revision: null
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  include_inputs_for_metrics: false
  include_for_metrics: []
  eval_do_concat_batches: true
  fp16_backend: auto
  push_to_hub_model_id: null
  push_to_hub_organization: null
  mp_parameters: ''
  auto_find_batch_size: false
  full_determinism: false
  torchdynamo: null
  ray_scope: last
  ddp_timeout: 1800
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  include_tokens_per_second: false
  include_num_input_tokens_seen: 'no'
  neftune_noise_alpha: null
  optim_target_modules: null
  batch_eval_metrics: false
  eval_on_start: false
  use_liger_kernel: true
  liger_kernel_config: null
  eval_use_gather_object: false
  average_tokens_across_devices: true
  use_muon: false
  freeze_modules: null
  use_rmpad: true
  fsdp2: true
  sp_ulysses_degree: 1
  reduce_dtype: bfloat16
  output_dtype: bfloat16
  print_batch_input_steps: 5
  # Profiling is disabled; the step window below is still specified.
  enable_profiler: false
  profiler_config:
    start_step: 1
    end_step: 3
# Model settings: which weights to load and how the model is configured.
model_config:
  # Weights come from a local pretrained path; no standalone config is given.
  load_from_pretrained_path: training/pretrained_models/Qwen/Qwen3-VL-8B-Instruct
  load_from_config: null

  # Attention implementation requested for the model.
  attn_implementation: flash_attention_2

  # Optional overrides, all unset or empty for this run.
  overwrite_config: null
  monkey_patch_kwargs: null
  extra_kwargs: {}
# Top-level extra keyword arguments; none are supplied (null) in this config.
extra_kwargs: null