config_DGX1_multi.sh 1.13 KB
Newer Older
huchen's avatar
huchen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
## System run parms
export DGXNNODES=2
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}

## DL params
#export LR=${LR:-"2.0e-3"}
#export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
#export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
#export WARMUP_STEPS=${WARMUP_STEPS:-200}
#export REMAIN_STEPS=${REMAIN_STEPS:-6453}
#export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
#export TARGET=${TARGET:-24.0}
#export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
#export NUMEPOCHS=${NUMEPOCHS:-15}
#export MATH=${MATH:-fp32}
#export DIST_OPTS=${DIST_OPTS-"\
#   --distributed-weight-update 2 \
#   --dwu-num-blocks 1 \
#   --dwu-num-chunks 2 \
#   --dwu-num-rs-pg 2 \
#   --dwu-num-ar-pg 2 \
#   --dwu-num-ag-pg 0 \
#   --dwu-grad-norm \
#   "}
#export EXTRA_OPTS=${EXTRA_OPTS-"\
#   --fused-attention \
#   --fused-xentropy \
#   --no-log-all-ranks \
#   "}

## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=2 	# HT is on is 2, HT off is 1
export DGXNSOCKET=4

#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1 	# HT is on is 2, HT off is 1
#export DGXNSOCKET=1