single_process.sh 2.78 KB
Newer Older
chenych's avatar
chenych committed
1
#!/bin/bash
Rayyyyy's avatar
Rayyyyy committed
2
3
4
5
# Show NCCL debug information in the job output.
NCCL_DEBUG=INFO
# NOTE(review): "none" presumably keeps NCCL from loading an external network
# plugin -- confirm against the NCCL/RCCL version in use.
NCCL_NET_PLUGIN=none
export NCCL_DEBUG NCCL_NET_PLUGIN

chenych's avatar
chenych committed
6
7
8
# Runtime toggles exported to the training process.
# NOTE(review): the names suggest ROCm fine-grained PCIe memory and MIOpen
# batch-norm selection -- confirm against the ROCm / PyTorch build in use.
export HSA_FORCE_FINE_GRAIN_PCIE=1 USE_MIOPEN_BATCHNORM=1

Rayyyyy's avatar
Rayyyyy committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# --- Rendezvous settings -----------------------------------------------------
# World size and global rank are copied from the OMPI_COMM_WORLD_* variables;
# the master address is taken from $dist_url, which this script does not set
# itself (it must already be present in the environment).
WORLD_SIZE="${OMPI_COMM_WORLD_SIZE}"
RANK="${OMPI_COMM_WORLD_RANK}"
MASTER_ADDR="${dist_url}"
MASTER_PORT=4321
export WORLD_SIZE RANK MASTER_ADDR MASTER_PORT

# --- NCCL / RCCL transport settings ------------------------------------------
export NCCL_P2P_LEVEL=5
export NCCL_PLUGIN_P2P=ucx
export NCCL_SHM_DISABLE=1
# NOTE(review): the original comment here read "do not use IB", but per the
# NCCL documentation a value of 0 does not disable the IB transport -- confirm
# which behaviour is intended.
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=mlx5_0
export NCCL_CROSS_NIC=1
export RCCL_NCHANNELS=4

# Node-local rank; kept as a plain (non-exported) shell variable for use later
# in this script.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"

chenych's avatar
chenych committed
23
24
# Root directory that holds every dataset referenced below.
DATA_PATH=/home/datasets
# Run name; used for the output and log directories under models/.
name=painter_vit_large

# Training command line, stored as one flat string.
#
# APP is expanded unquoted by the launcher further down, so it is split on
# whitespace into individual arguments: no argument may contain spaces, and
# nothing but command-line tokens may appear inside the quotes.
#
#   --json_path      takes the eight training annotation files that follow it
#   --val_json_path  takes the eight validation annotation files that follow it
#   --finetune       NOTE(review): "path/to/..." looks like a placeholder --
#                    point it at the real pretrained checkpoint before running.
APP="python3 -u main_train.py  \
    --batch_size 2 \
    --accum_iter 16  \
    --model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \
    --num_mask_patches 784 \
    --max_mask_patches_per_block 392 \
    --epochs 15 \
    --warmup_epochs 1 \
    --lr 1e-3 \
    --clip_grad 3 \
    --layer_decay 0.8 \
    --drop_path 0.1 \
    --input_size 896 448 \
    --save_freq 1 \
    --data_path ${DATA_PATH}/ \
    --json_path  \
    ${DATA_PATH}/nyu_depth_v2/nyuv2_sync_image_depth.json \
    ${DATA_PATH}/ade20k/ade20k_training_image_semantic.json \
    ${DATA_PATH}/coco/pano_ca_inst/coco_train_image_panoptic_inst.json \
    ${DATA_PATH}/coco/pano_sem_seg/coco_train2017_image_panoptic_sem_seg.json \
    ${DATA_PATH}/coco_pose/coco_pose_256x192_train.json \
    ${DATA_PATH}/denoise/denoise_ssid_train.json \
    ${DATA_PATH}/derain/derain_train.json \
    ${DATA_PATH}/light_enhance/enhance_lol_train.json \
    --val_json_path \
    ${DATA_PATH}/nyu_depth_v2/nyuv2_test_image_depth.json \
    ${DATA_PATH}/ade20k/ade20k_validation_image_semantic.json \
    ${DATA_PATH}/coco/pano_ca_inst/coco_val_image_panoptic_inst.json \
    ${DATA_PATH}/coco/pano_sem_seg/coco_val2017_image_panoptic_sem_seg.json \
    ${DATA_PATH}/coco_pose/coco_pose_256x192_val.json \
    ${DATA_PATH}/denoise/denoise_ssid_val.json \
    ${DATA_PATH}/derain/derain_test_rain100h.json \
    ${DATA_PATH}/light_enhance/enhance_lol_val.json \
    --output_dir models/${name} \
    --log_dir models/${name}/logs \
    --finetune path/to/mae_pretrain_vit_large.pth
"
chenych's avatar
chenych committed
62

Rayyyyy's avatar
Rayyyyy committed
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Launch the training command for this process, pinned to the NUMA node and
# mlx5 device that correspond to its node-local rank.
#
# Reads:   lrank - node-local rank (non-negative integer)
#          APP   - flat command string, deliberately word-split on expansion
# Exports: HIP_VISIBLE_DEVICES, UCX_NET_DEVICES, UCX_IB_PCI_BW
# Exit:    1 if lrank is missing or not a non-negative integer; otherwise the
#          exit status of the launched command.
#
# slot = lrank % 4 selects device mlx5_<slot> and NUMA node <slot>. Every slot
# sees the same four GPUs (0,1,2,3).

# Without this guard a missing or malformed lrank would match no slot and the
# script would finish "successfully" without ever starting the training run.
if [[ ! ${lrank:-} =~ ^[0-9]+$ ]]; then
  printf '%s: lrank must be a non-negative integer (got "%s"); is OMPI_COMM_WORLD_LOCAL_RANK set?\n' \
    "${0##*/}" "${lrank:-}" >&2
  exit 1
fi

# The 10# prefix forces base 10 so a zero-padded rank is not read as octal.
slot=$(( 10#${lrank} % 4 ))

export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES="mlx5_${slot}:1"
export UCX_IB_PCI_BW="mlx5_${slot}:50Gbs"

# APP is a flat command string, so it must be split into words here.
# shellcheck disable=SC2086
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${slot}" --membind="${slot}" ${APP}