    PID TTY      STAT   TIME COMMAND
  79819 pts/4    Ss     0:00 -bash
  80848 pts/4    S+     0:00  \_ bash run.sh
  80849 pts/4    Sl+    0:00      \_ /opt/mpi/bin/mpirun --allow-run-as-root -np 64 --hostfile hostfile --tag-output --merge-stderr-to-stdout --output-filename tmp -mca plm_rsh_args -p 3344 -x NCCL_SOCKET_IFNAME=ens1f0 ./llama3_70b.sh node11
  80854 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node12     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 1 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80855 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node13     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 2 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80856 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node14     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 3 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80857 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node15     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 4 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80858 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node16     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 5 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80859 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node17     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 6 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80860 pts/4    S      0:00          \_ /usr/bin/ssh -x -p 3344 node18     PATH=/opt/mpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/mpi/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /opt/mpi/bin/orted -mca ess "env" -mca ess_base_jobid "2143813632" -mca ess_base_vpid 7 -mca ess_base_num_procs "8" -mca orte_node_regex "node[2:11-18]@0(8)" -mca orte_hnp_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2143813632.0;tcp://172.16.1.11,10.1.1.11,172.17.0.1,192.168.35.193:48505" -mca plm_rsh_args "-p 3344" -mca orte_tag_output "1" -mca orte_output_filename "tmp" -mca pmix "^s1,s2,cray,isolated"
  80861 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80883 pts/4    SLl   10:40          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81108 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81124 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81136 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81148 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81161 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81173 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81182 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81196 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81206 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81218 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81229 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81242 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81253 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81263 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81273 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81287 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81294 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81303 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81310 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81318 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81326 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81334 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81342 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81348 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81357 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81365 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81372 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81380 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81388 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81396 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81400 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81406 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81831 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  81894 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  82022 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  82023 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  82027 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  82028 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 0 --world_size 64 --dist_url tcp://node11:34566
  80862 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80888 pts/4    SLl   10:39          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81261 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81281 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81292 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81300 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81308 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81316 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81324 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81332 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81341 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81349 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81358 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81366 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81375 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81383 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81393 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81399 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81405 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81413 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81416 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81418 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81420 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81422 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81424 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81426 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81428 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81430 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81432 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81434 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81436 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81438 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81440 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  81442 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 1 --world_size 64 --dist_url tcp://node11:34566
  80865 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80900 pts/4    SLl   10:37          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81102 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81117 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81129 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81140 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81153 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81166 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81178 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81190 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81200 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81215 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81228 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81240 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81251 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81264 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81276 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81288 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81296 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81304 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81312 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81319 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81327 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81335 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81343 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81350 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81359 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81368 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81376 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81384 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81392 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81398 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81404 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  81411 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 2 --world_size 64 --dist_url tcp://node11:34566
  80868 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80898 pts/4    SLl   10:39          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80956 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80961 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80965 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80972 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80979 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80986 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80994 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81001 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81009 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81015 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81025 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81031 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81039 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81047 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81056 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81063 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81072 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81082 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81091 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81098 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81106 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81119 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81131 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81141 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81154 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81167 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81179 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81191 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81202 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81214 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81226 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  81237 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 3 --world_size 64 --dist_url tcp://node11:34566
  80872 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80904 pts/4    SLl   10:38          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  80963 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  80973 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  80981 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  80988 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  80995 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81004 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81013 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81020 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81029 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81037 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81045 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81053 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81061 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81071 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81081 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81088 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81096 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81105 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81118 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81128 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81138 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81152 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81165 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81176 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81189 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81201 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81213 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81225 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81238 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81250 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81262 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  81274 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 4 --world_size 64 --dist_url tcp://node11:34566
  80876 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80905 pts/4    SLl   10:38          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80925 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80927 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80929 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80931 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80933 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80935 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80937 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80939 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80941 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80943 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80945 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80947 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80949 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80951 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80953 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80955 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80959 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80962 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80970 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80976 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80985 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80993 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80999 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81007 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81017 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81024 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81032 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81041 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81049 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81057 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81065 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  81075 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 5 --world_size 64 --dist_url tcp://node11:34566
  80882 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80907 pts/4    SLl   10:37          |   \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81068 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81084 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81093 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81101 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81111 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81123 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81133 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81147 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81157 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81172 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81184 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81197 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81208 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81221 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81233 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81245 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81254 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81269 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81279 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81291 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81298 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81306 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81314 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81323 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81331 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81339 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81346 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81355 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81362 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81370 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81378 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  81385 pts/4    Sl     0:00          |       \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 6 --world_size 64 --dist_url tcp://node11:34566
  80886 pts/4    S      0:00          \_ /bin/bash ./llama3_70b.sh node11
  80908 pts/4    SLl   10:39              \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  80969 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  80980 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  80989 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  80998 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81005 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81014 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81021 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81030 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81038 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81046 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81054 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81062 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81070 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81079 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81087 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81095 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81104 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81116 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81127 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81139 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81150 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81164 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81175 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81188 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81199 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81211 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81222 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81235 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81248 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81258 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81270 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  81282 pts/4    Sl     0:00                  \_ python3 -u pretrain_gpt.py --num-layers 80 --hidden-size 16384 --num-attention-heads 128 --ffn-hidden-size 20480 --seq-length 8192 --max-position-embeddings 8192 --num-query-groups 8 --group-query-attention --log-throughput --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 1 --train-iters 120 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-flash-attn-triton --optimizer adam --use-distributed-optimizer --ddp-average-in-collective --overlap-grad-reduce --disable-bias-linear --recompute-activations --attention-dropout 0 --hidden-dropout 0 --no-gradient-accumulation-fusion --swiglu --lr 1.5e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --sequence-parallel --tensor-model-parallel-size 8 --pipeline-model-parallel-size 8 --data-path /mnt/fs/user/llama/panhw/Megatron-LM-main/dataset/alpaca_text_document --split 949,50,1 --untie-embeddings-and-output-weights --use-rotary-position-embeddings --normalization RMSNorm --no-position-embedding --tokenizer-type HuggingFaceTokenizer --log-interval 1 --log-throughput --save-interval 10000 --eval-interval 1000 --save /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --load /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --eval-iters 1000 --tensorboard-dir /mnt/fs/user/llama/panhw/Megatron-LM-main/tmp_8b --rank 7 --world_size 64 --dist_url tcp://node11:34566
  82346 pts/15   Ss     0:00 bash
  82366 pts/15   S+     0:00  \_ vim llama3_70b.sh
  79835 pts/5    Ss     0:00 bash
  82389 pts/5    R+     0:00  \_ ps af
  79661 pts/14   Ss+    0:00 bash
  65721 pts/3    Ss+    0:00 bash
  65679 pts/2    Ss     0:00 /bin/bash
  65701 pts/2    S+     0:00  \_ ssh node27 -p 3344
  64008 pts/1    Ss+    0:00 bash
      1 pts/0    Ss+    0:00 /bin/bash
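
The pretrain_gpt.py commands captured above all pass --tensor-model-parallel-size 8, --pipeline-model-parallel-size 8 and --world_size 64, which implies a data-parallel size of 64 / (8 x 8) = 1; that is consistent with --global-batch-size 1 and --micro-batch-size 1 (a single data-parallel replica, no gradient accumulation). A minimal sketch of that arithmetic follows; the variable values are copied from the command lines in the listing and the script itself is illustrative, not part of the recorded output.

    # Sketch: derive the data-parallel size from the parallelism flags visible
    # in the ps output above (values taken from the pretrain_gpt.py command line).
    world_size = 64                    # --world_size 64 (8 nodes x 8 ranks, per the mpirun -np 64 launch)
    tensor_model_parallel_size = 8     # --tensor-model-parallel-size 8
    pipeline_model_parallel_size = 8   # --pipeline-model-parallel-size 8

    model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
    assert world_size % model_parallel_size == 0, "world size must be divisible by TP * PP"
    data_parallel_size = world_size // model_parallel_size

    print("model-parallel group size:", model_parallel_size)  # 64
    print("data-parallel size:", data_parallel_size)          # 1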