---
# Training configuration: LoRA fine-tuning of Qwen/Qwen2-VL-2B-Instruct on
# OpenAI batch query/response data stored in S3.
#
# NOTE(review): this file was restored from a copy whose line breaks and
# indentation had been collapsed. The nesting of `size` (placed inside each
# source entry), the position of `max_workers` (placed at top level), and the
# line breaks inside `chat_template` are reconstructed - confirm against the
# schema of the code that loads this file.

model:
  name_or_path: Qwen/Qwen2-VL-2B-Instruct
  arch: causal
  use_flash_attn: true

wandb:
  project: pdelfin
  entity: ai2-llm

# TODO This is not used
format:
  # Both values end in ':' and therefore must stay quoted.
  instruction_template: "Original:"
  response_template: "Rewritten:"
  # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
  # Literal block scalar (`|`): newlines between the lines below are kept and
  # the value ends with a single trailing newline. The `\n` sequences inside
  # the Jinja string literals are passed through to the template engine as-is.
  chat_template: |
    {% for message in messages %}
    {{'<|im_start|>' + message['role'] + '\n' + message['content']}}
    {% if loop.last %}
    {{ '<|im_end|>'}}
    {% else %}
    {{ '<|im_end|>\n' }}
    {% endif %}
    {% endfor %}

generate:
  max_length: 4096

train_data:
  seed: 1337
  sources:
    - name: openai_batch_data_v2
      # Globs are quoted so the `*` can never be mistaken for a YAML alias.
      query_glob_path: 's3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl'
      response_glob_path: 's3://ai2-oe-data/jakep/openai_batch_done_v2/*.json'
      backend:
        - openai
      # Was `100_000`; underscore grouping is YAML 1.1-only, so the plain
      # integer is used to load as an int under any parser.
      size: 100000

valid_data:
  sources:
    - name: openai_batch_data_eval_mini
      query_glob_path: 's3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl'
      response_glob_path: 's3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json'
      backend:
        - openai
      # Was `100_000`; see the note on train_data above.
      size: 100000

# Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
hparams:
  batch_size: 1
  eval_batch_size: 1
  gradient_accumulation_steps: 4
  gradient_checkpointing: false
  clip_grad_norm: 1.0
  # Was `3e-4`. YAML 1.1 loaders (e.g. PyYAML) only recognise scientific
  # notation as a float when the mantissa has a decimal point; without it the
  # value is loaded as the string "3e-4".
  learning_rate: 3.0e-4
  max_steps: 2000
  pad_multiple_of: 16
  log_every_steps: 50
  eval_every_steps: 1000
  optim: adamw_torch
  lr_scheduler: cosine
  weight_decay: 0.01
  warmup_ratio: 0.03

# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
lora:
  rank: 32
  alpha: 32
  dropout: 0.05
  task_type: causal_lm
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
    # The entries below contain `[0-9]+`, so they are presumably matched as
    # regular expressions by the consumer - TODO confirm. Quoted so that the
    # brackets are always read as plain text.
    - 'visual.blocks.[0-9]+.attn.qkv'
    - 'visual.blocks.[0-9]+.attn.proj'
    - 'visual.blocks.[0-9]+.mlp.fc1'
    - 'visual.blocks.[0-9]+.mlp.fc2'
    - visual.merger.mlp.0
    - visual.merger.mlp.2

save:
  path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
  save_every_steps: 1000

# NOTE(review): placed at top level; it could instead belong under `save` -
# confirm against the loader's schema.
max_workers: 10