run1.sh 730 Bytes
Newer Older
hepj's avatar
hepj committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/bin/bash
#
# Launches a single-node, single-process run of main.py through
# torch.distributed.launch: model Conformer_small_patch16, data set IMNET,
# one epoch. All paths and hyper-parameters below are fixed values.

# Settings exported for the ROCm stack underneath PyTorch (HSA runtime,
# MIOpen, rocBLAS). The values are passed through as-is; consult the
# respective library documentation for what each value selects.
export HSA_FORCE_FINE_GRAIN_PCIE=1 MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1 ROCBLAS_LAYER=3

# Drop the rocm/2.9 compiler module from the environment before launching.
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=${MIOPEN_FIND_MODE}"

# Rank/size values taken from the Open MPI environment. They are not
# referenced again in the visible part of this script.
lrank=${OMPI_COMM_WORLD_LOCAL_RANK}
comm_rank=${OMPI_COMM_WORLD_RANK}
comm_size=${OMPI_COMM_WORLD_SIZE}

#NCCL_DEBUG=INFO

# Expose only device index 0 to the process (presumably a single-GPU run,
# matching --nproc_per_node=1 below).
export HIP_VISIBLE_DEVICES=0

# Options for torch.distributed.launch itself.
launcher_opts=(
  --master_port 50130
  --nnodes 1
  --nproc_per_node=1
  --use_env
)

# Options forwarded to main.py.
train_opts=(
  --model Conformer_small_patch16
  --data-set IMNET
  --batch-size 64
  --lr 0.001
  --num_workers 1
  --data-path /public/software/apps/DeepLearning/Data/ImageNet-pytorch
  --output_dir /public/home/hepj/SothisAI/Conformer-main/out_dir_1
  --epochs 1
)

python -m torch.distributed.launch "${launcher_opts[@]}" main.py "${train_opts[@]}"