slurm-37908748.out 8.18 KB
Newer Older
hepj987's avatar
hepj987 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
+ HOME_PATH=/public/home/hepj
+ WORK_PATH=/public/home/hepj/torch/BERT/2node-run-squad
+ source /public/home/hepj/env22.10.sh
/opt/gridview/slurm/spool_slurmd/job37908748/slurm_script: line 13: /public/home/hepj/env22.10.sh: No such file or directory
+ which python3
/public/home/hepj/job_env/dtk22.10-torch-1.10-py3.7/bin/python3
+ hostfile=./37908748
+ scontrol show hostnames 'j20r4n[01-02]'
++ cat ./37908748
+ for i in '`cat $hostfile`'
+ echo j20r4n01 slots=4
++ pwd
+ (( num_node=+1 ))
+ for i in '`cat $hostfile`'
+ echo j20r4n02 slots=4
++ pwd
+ (( num_node=1+1 ))
+ num_dcu=8
+ echo 8
8
++ cat ./37908748
++ sed -n 1p
+ nodename=j20r4n01
+ echo j20r4n01
j20r4n01
++ echo j20r4n01
++ awk '{print $1}'
+ dist_url=j20r4n01
+ export HSA_USERPTR_FOR_PAGED_MEM=0
+ HSA_USERPTR_FOR_PAGED_MEM=0
+ mpirun -np 8 --hostfile hostfile-37908748 /public/home/hepj/torch/BERT/2node-run-squad/2nodes_single_process.sh j20r4n01
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 0 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 1 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 2 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 3 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 4 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 5 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 6 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 7 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[60184,1],5]
  Exit code:    2
--------------------------------------------------------------------------