try.sh 683 Bytes
Newer Older
liangjing's avatar
liangjing committed
1
2
3
4
5
6
7
8
9
# Launch an 8-process training run of train.py through torchrun.
# The config file and train.py are referenced by relative path, so this is
# expected to be run from the directory that contains them.

# Load the run configuration. The exit status of `source` is that of the last
# command inside the file, so a non-zero status alone is not treated as fatal;
# abort only when the file itself could not be read, rather than silently
# training without any configuration.
if ! source config_DGXA100_001x08x032.sh \
  && [[ ! -r config_DGXA100_001x08x032.sh ]]; then
  echo "error: cannot read config_DGXA100_001x08x032.sh" >&2
  exit 1
fi

# Expose eight devices (indices 0-7), matching --nproc_per_node=8 below.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Disabled environment overrides, kept for reference.
#export HSA_FORCE_FINE_GRAIN_PCIE=1
#export MIOPEN_FIND_MODE=5
#export NCCL_NET_GDR_LEVEL=5
#export NCCL_P2P_LEVEL=5

# Arguments passed to train.py, one option per line for readability.
train_args=(
  --lr 0.000085
  --batch-size 18
  --eval-batch-size 32
  --epochs 1
  --print-freq 20
  --dataset-path /public/home/liangjj/2023/training_results_v2.1-main/NVIDIA/benchmarks/ssd/implementations/pytorch-22.09/public-scripts/datasets/open-images-v6
  --warmup-epochs 0
  --frozen-bn-opt
  --frozen-bn-fp16
  --apex-adam
  --disable-ddp-broadcast-buffers
  --fp16-allreduce
  --skip-metric-loss
  --async-coco
)

# torchrun is the last command, so its exit status becomes the script's.
torchrun --nproc_per_node=8 train.py "${train_args[@]}"

# Alternative launch through a wrapper script (disabled).
#torchrun --standalone --nproc_per_node=8 --no_python ./dcu_run.sh