train_sensenova_si_8M.sh 2.55 KB
Newer Older
raojy's avatar
first  
raojy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
set -x

# ============================================================
# SenseNova-SI-8M Training with BAGEL-7B-MoT
# ============================================================
#
# Step 1: Download the dataset from Hugging Face:
#   pip install huggingface_hub
#   python -c "
#   from huggingface_hub import snapshot_download
#   snapshot_download(
#       repo_id='sensenova/SenseNova-SI-8M',
#       repo_type='dataset',
#       local_dir='data/SenseNova-SI-8M',
#   )
#   "
#
# Step 2: Download the BAGEL-7B-MoT model:
#   python -c "
#   from huggingface_hub import snapshot_download
#   snapshot_download(
#       repo_id='ByteDance-Seed/BAGEL-7B-MoT',
#       local_dir='models/BAGEL-7B-MoT',
#   )
#   "
#
# Step 3: Run this script from the bagel-train directory.
# ============================================================

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_DIR="$(python -c 'import pathlib; print(pathlib.Path("'"${SCRIPT_DIR}"'").parent.resolve())')"
TRAINING_ROOT="$(python -c 'import pathlib; print(pathlib.Path("'"${REPO_DIR}"'").parent.resolve())')"
MODEL_PATH="${MODEL_PATH:-${TRAINING_ROOT}/pretrained_models/BAGEL-7B-MoT}"
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
NNODES=${NNODES:-1}
RESULTS_DIR="${RESULTS_DIR:-${TRAINING_ROOT}/results/bagel/sensenova_si_8M}"
DATASET_CONFIG_FILE="${DATASET_CONFIG_FILE:-${REPO_DIR}/data/configs/sensenova_si_8M.yaml}"

export PYTHONPATH="${PYTHONPATH}:${REPO_DIR}"
export TRAINING_ROOT="${TRAINING_ROOT}"

cd "${REPO_DIR}"

torchrun \
    --nproc_per_node ${NPROC_PER_NODE} \
    --nnodes ${NNODES} \
    --master-addr ${MASTER_ADDR:-127.0.0.1} \
    --master-port ${MASTER_PORT:-29500} \
    --node-rank ${RANK:-0} \
    "${REPO_DIR}/train/pretrain_unified_navit.py" \
        --dataset_config_file "${DATASET_CONFIG_FILE}" \
        --model_name BAGEL \
        --resume_from_hf ${MODEL_PATH} \
        --layer_module Qwen2MoTDecoderLayer \
        --max_latent_size 64 \
        --resume-from ${MODEL_PATH} \
        --auto_resume True \
        --resume-model-only True \
        --finetune-from-ema True \
        --log_every 1 \
        --lr 1e-6 \
        --num_worker 0 \
        --prefetch_factor 1 \
        --expected_num_tokens 32768 \
        --max_num_tokens 70000 \
        --max_num_tokens_per_sample 50000 \
        --freeze_vit True \
        --visual_und True \
        --visual_gen False \
        --results_dir ${RESULTS_DIR} \
        --checkpoint_dir ${RESULTS_DIR}/checkpoints \
        --total_steps 50000 \
        --save_every 1 \
        --wandb_offline True \
        --num_shard ${NPROC_PER_NODE}