"src/vscode:/vscode.git/clone" did not exist on "565b59952fa05d280c9120c0b84e71303c574ca4"
# config_DGX1.sh
## System run parms
export DGXNNODES=1

# Derive the system name from this file's own name: resolve symlinks, drop the
# directory part, then strip the "config_" prefix and the ".sh" suffix
# (config_DGX1.sh -> DGX1). Every expansion is quoted so that a path containing
# whitespace or glob characters is not word-split before reaching readlink.
# If the path cannot be resolved the name is left empty rather than aborting a
# caller that runs under `set -e`.
DGXSYSTEM=$(readlink -f -- "${BASH_SOURCE[0]}") || DGXSYSTEM=""
DGXSYSTEM=${DGXSYSTEM##*/}
DGXSYSTEM=${DGXSYSTEM#config_}
DGXSYSTEM=${DGXSYSTEM%.sh}
export DGXSYSTEM

# A non-empty WALLTIME already present in the environment is kept; otherwise
# the default below is used.
export WALLTIME=${WALLTIME:-"00:30:00"}

## DL params
# Scalar settings: a non-empty value already present in the environment is
# kept; otherwise the default written here is assigned. All are exported.
: "${LR:=2.0e-3}"
: "${TRAIN_BATCH_SIZE:=64}"
: "${TEST_BATCH_SIZE:=64}"
: "${WARMUP_STEPS:=200}"
: "${REMAIN_STEPS:=6453}"
: "${DECAY_INTERVAL:=809}"
: "${TARGET:=24.0}"
: "${MAX_SEQ_LEN:=75}"
: "${NUMEPOCHS:=15}"
: "${MATH:=fp32}"
export LR TRAIN_BATCH_SIZE TEST_BATCH_SIZE WARMUP_STEPS REMAIN_STEPS
export DECAY_INTERVAL TARGET MAX_SEQ_LEN NUMEPOCHS MATH

# Option strings: the defaults apply only when the variable is completely
# unset, so exporting DIST_OPTS="" or EXTRA_OPTS="" beforehand yields an empty
# option list instead of the defaults.
# A disabled variant of the DIST_OPTS default additionally passes
# `--distributed-weight-update 2` as its first option.
if [[ -z ${DIST_OPTS+set} ]]; then
  DIST_OPTS="\
   --dwu-num-blocks 1 \
   --dwu-num-chunks 2 \
   --dwu-num-rs-pg 2 \
   --dwu-num-ar-pg 2 \
   --dwu-num-ag-pg 0 \
   --dwu-grad-norm \
   "
fi
export DIST_OPTS

if [[ -z ${EXTRA_OPTS+set} ]]; then
  EXTRA_OPTS="\
   --fused-attention \
   --fused-xentropy \
   --no-log-all-ranks \
   "
fi
export EXTRA_OPTS

## System config params
# Fixed values; unlike the DL params they cannot be overridden from the
# environment because they are assigned unconditionally.
DGXNGPU=4
DGXSOCKETCORES=8
DGXHT=1 	# 2 when HT is on, 1 when HT is off
DGXNSOCKET=4
export DGXNGPU DGXSOCKETCORES DGXHT DGXNSOCKET

# A disabled alternative sets DGXNGPU=1 and DGXNSOCKET=1, with DGXSOCKETCORES=8
# and DGXHT=1 unchanged.