+ HOME_PATH=/public/home/hepj + WORK_PATH=/public/home/hepj/torch/BERT/2node-run-squad + source /public/home/hepj/env22.10.sh /opt/gridview/slurm/spool_slurmd/job37908748/slurm_script: line 13: /public/home/hepj/env22.10.sh: No such file or directory + which python3 /public/home/hepj/job_env/dtk22.10-torch-1.10-py3.7/bin/python3 + hostfile=./37908748 + scontrol show hostnames 'j20r4n[01-02]' ++ cat ./37908748 + for i in '`cat $hostfile`' + echo j20r4n01 slots=4 ++ pwd + (( num_node=+1 )) + for i in '`cat $hostfile`' + echo j20r4n02 slots=4 ++ pwd + (( num_node=1+1 )) + num_dcu=8 + echo 8 8 ++ cat ./37908748 ++ sed -n 1p + nodename=j20r4n01 + echo j20r4n01 j20r4n01 ++ echo j20r4n01 ++ awk '{print $1}' + dist_url=j20r4n01 + export HSA_USERPTR_FOR_PAGED_MEM=0 + HSA_USERPTR_FOR_PAGED_MEM=0 + mpirun -np 8 --hostfile hostfile-37908748 /public/home/hepj/torch/BERT/2node-run-squad/2nodes_single_process.sh j20r4n01 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 0 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 1 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 2 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 3 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 4 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 5 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 6 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 7 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567 (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory (null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory -------------------------------------------------------------------------- Primary job terminated normally, but 1 process returned a non-zero exit code. Per user-direction, the job has been aborted. -------------------------------------------------------------------------- -------------------------------------------------------------------------- mpirun detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was: Process name: [[60184,1],5] Exit code: 2 --------------------------------------------------------------------------