train_graph.sh 1.05 KB
Newer Older
yuguo960516's avatar
yuguo960516 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Launches train.py (located in the parent directory of this script) on a
# single device, using the mini-imagenet OFRecord dataset. The dataset is
# downloaded and unpacked into the current working directory when it is not
# already present, and checkpoints are written under ./graph_checkpoints.
#
# Usage: run from the directory that should hold the dataset and checkpoints.
# Requires: python3, wget, unzip, realpath.

# Debug aid, disabled by default (-a export all, -u unset is an error, -x trace).
# set -aux

# Make Python flush stdout/stderr immediately so training logs appear live.
export PYTHONUNBUFFERED=1
echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"

CHECKPOINT_SAVE_PATH="./graph_checkpoints"
# -p makes this idempotent and also creates missing parent directories.
mkdir -p "$CHECKPOINT_SAVE_PATH" || {
    echo "error: cannot create checkpoint directory $CHECKPOINT_SAVE_PATH" >&2
    exit 1
}

OFRECORD_PATH="./mini-imagenet/ofrecord"

if [ ! -d "$OFRECORD_PATH" ]; then
    # Abort early on a failed download/extract instead of starting a training
    # run that cannot find its data.
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip || {
        echo "error: failed to download mini-imagenet.zip" >&2
        exit 1
    }
    unzip mini-imagenet.zip || {
        echo "error: failed to extract mini-imagenet.zip" >&2
        exit 1
    }
    if [ ! -d "$OFRECORD_PATH" ]; then
        echo "error: $OFRECORD_PATH not found after extracting mini-imagenet.zip" >&2
        exit 1
    fi
fi

OFRECORD_PART_NUM=8
LEARNING_RATE=0.256
MOM=0.875
EPOCH=90
TRAIN_BATCH_SIZE=128
VAL_BATCH_SIZE=128

# SRC_DIR=/path/to/models/resnet50
# Resolve the directory one level above this script; quoted so that paths
# containing spaces or glob characters survive intact.
SRC_DIR=$(realpath "$(dirname "$0")/..")

# NOTE(review): the samples-per-epoch values equal the batch sizes, so each
# epoch presumably covers a single batch -- confirm against train.py.
python3 "$SRC_DIR/train.py" \
    --ofrecord-path "$OFRECORD_PATH" \
    --ofrecord-part-num "$OFRECORD_PART_NUM" \
    --num-devices-per-node 1 \
    --lr "$LEARNING_RATE" \
    --momentum "$MOM" \
    --num-epochs "$EPOCH" \
    --warmup-epochs 0 \
    --train-batch-size "$TRAIN_BATCH_SIZE" \
    --val-batch-size "$VAL_BATCH_SIZE" \
    --save "$CHECKPOINT_SAVE_PATH" \
    --samples-per-epoch 128 \
    --val-samples-per-epoch 128 \
    --scale-grad \
    --graph \
    --skip-eval