Unverified Commit 8cacccec authored by jihan.yang's avatar jihan.yang Committed by GitHub
Browse files

fixbug: dist test in torch 2.0 (#1602)

* fixbug: torch 2.0 dist train with error local rank

* fixbug: dist test in torch 2.0
parent 839d8dd9
Pipeline #2714 failed with stages
in 0 seconds
#!/usr/bin/env bash
# Distributed test launcher: pick a free TCP port for the torchrun
# rendezvous endpoint, then run test.py on NGPUS processes.
#
# Usage: dist_test.sh NGPUS [extra args forwarded to test.py ...]
set -x

NGPUS=$1
# Capture the remaining arguments as an ARRAY so that arguments
# containing spaces survive the hand-off to test.py. The previous
# scalar form (PY_ARGS=${@:2}, expanded unquoted) re-split every
# argument on whitespace.
PY_ARGS=("${@:2}")

# Probe random ports in [10000, 59151] until an unused one is found.
# (RANDOM is 15 bits, so (RANDOM<<15)|RANDOM gives a 30-bit value.)
while true
do
    PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
    # nc -z exits 0 when something is already listening on the port;
    # any nonzero status means the port is free.
    status="$(nc -z 127.0.0.1 $PORT < /dev/null &>/dev/null; echo $?)"
    if [ "${status}" != "0" ]; then
        break
    fi
done
echo $PORT

torchrun --nproc_per_node=${NGPUS} --rdzv_endpoint=localhost:${PORT} test.py --launcher pytorch "${PY_ARGS[@]}"
@@ -29,7 +29,7 @@ def parse_config():
     parser.add_argument('--pretrained_model', type=str, default=None, help='pretrained_model')
     parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none')
     parser.add_argument('--tcp_port', type=int, default=18888, help='tcp port for distrbuted training')
-    parser.add_argument('--local_rank', type=int, default=0, help='local rank for distributed training')
+    parser.add_argument('--local_rank', type=int, default=None, help='local rank for distributed training')
     parser.add_argument('--set', dest='set_cfgs', default=None, nargs=argparse.REMAINDER,
                         help='set extra config keys if needed')
@@ -145,6 +145,9 @@ def main():
         dist_test = False
         total_gpus = 1
     else:
+        if args.local_rank is None:
+            args.local_rank = int(os.environ.get('LOCAL_RANK', '0'))
+
         total_gpus, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
             args.tcp_port, args.local_rank, backend='nccl'
         )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment