# LoRA fine-tuning configuration for allenai/Molmo-7B-O-0924.
# Sections: model, experiment tracking (wandb), generation limits, training and
# validation data sources, optimizer hyperparameters, LoRA adapter settings,
# and checkpoint saving.

model:
  name_or_path: allenai/Molmo-7B-O-0924
  arch: causal
  use_flash_attn: true

wandb:
  project: pdelfin
  entity: ai2-llm

generate:
  max_length: 4096

train_data:
  seed: 1337
  cache_location: /data/jakep/pdfdata/pdelfin_cache
  sources:
    - name: openai_batch_data_v5_1_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]
    - name: openai_batch_data_v5_1_iabooks_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]

valid_data:
  cache_location: /data/jakep/pdfdata/pdelfin_cache
  # Presumably "<source name>_loss" for one of the sources below; keep the two
  # in sync when renaming a source. TODO confirm against the trainer.
  metric_for_best_model: openai_batch_data_v5_1_eval_loss
  sources:
    # These tend to be small, so loading them straight from S3 is no big deal.
    - name: openai_batch_data_v5_1_eval
      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]
    # NOTE(review): this source used to be listed twice, verbatim (same name and
    # same glob), which only repeated the identical evaluation. If a separate
    # iabooks eval set was intended (mirroring train_data), add it here under
    # its own distinct name once the path is confirmed.

# Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
hparams:
  batch_size: 1
  eval_batch_size: 1
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  find_unused_parameters: true
  clip_grad_norm: 1.0
  # Written with an explicit ".0" so YAML 1.1 loaders (e.g. PyYAML) resolve it
  # as a float; a bare "1e-4" is read as a string by those loaders.
  learning_rate: 1.0e-4
  max_steps: 10000
  pad_multiple_of: 16
  log_every_steps: 10
  eval_every_steps: 100
  optim: adamw_torch
  lr_scheduler: cosine
  weight_decay: 0.01
  warmup_ratio: 0.03

# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
lora:
  rank: 32
  alpha: 32
  dropout: 0.05
  task_type: CAUSAL_LM
  target_modules:
    # attention layers in main transformer
    - att_proj
    - ff_proj
    - attn_out
    - ff_out
    # vision transformer attention and FF
    - attention.wq
    - attention.wk
    - attention.wv
    - attention.wo
    - feed_forward.w1
    - feed_forward.w2
    # vision image projector
    - vision_backbone.image_projector.w1
    - vision_backbone.image_projector.w2
    - vision_backbone.image_projector.w3

save:
  path: s3://ai2-oe-data/jakep/experiments/molmo-o-0924/v1/models/
  save_every_steps: 1000

# NOTE(review): placed at top level (not under `save`) — confirm against the
# config schema that consumes this file.
max_workers: 10