#!/bin/bash
#
# Slurm batch script: launches train.sh on every allocated node via srun.
#
# Resolves the first host of the allocation as the master node and exports
# NODE (node count), ADDR (master address) and PORT (master port) so the
# launched tasks can read them from the environment.
#
# Submit with `sbatch` from the directory that contains env.sh and train.sh.
# NOTE(review): the log/ directory presumably has to exist before submission,
# since the --output/--error paths below point into it — confirm on the
# target cluster.
#
# The #SBATCH directives must stay on their own lines, ahead of the first
# executable command, for sbatch to pick them up.
#SBATCH --job-name=LLAMA
#SBATCH --partition=kshdnormal01
#SBATCH --nodes=8
#SBATCH --cpus-per-task=32
#SBATCH --ntasks-per-node=1
#SBATCH --gres=dcu:4
#SBATCH --mem=100G
#SBATCH --wait-all-nodes=1
#SBATCH --exclusive
#SBATCH --output=log/%j.out.log
#SBATCH --error=log/%j.err.log

# First hostname in the expanded node list acts as the master node.
M_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
if [[ -z "$M_NODE" ]]; then
  echo "error: could not determine master node from SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST:-}'" >&2
  exit 1
fi

# Pull the value of the NodeAddr=<addr> field out of the node description.
# The field is located by name rather than by position on its line.
M_ADDR=$(scontrol show node="$M_NODE" \
  | awk '{
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^NodeAddr=/) {
          sub(/^NodeAddr=/, "", $i)
          print $i
          exit
        }
      }
    }')
if [[ -z "$M_ADDR" ]]; then
  # Fail before srun: an empty master address would otherwise be exported
  # to every task.
  echo "error: could not resolve NodeAddr for master node '$M_NODE'" >&2
  exit 1
fi

M_PORT=12345

echo "SLURMD_NODENAME=$SLURMD_NODENAME"

# Environment setup shared with the training tasks (contents not visible here).
source env.sh

export NODE=$SLURM_NNODES
export ADDR=$M_ADDR
export PORT=$M_PORT

echo "NODE=$SLURM_NNODES"
echo "ADDR=$M_ADDR"
echo "PORT=$M_PORT"

# One task per node (--ntasks-per-node=1); each runs train.sh with argument 4.
# NOTE(review): 4 presumably matches --gres=dcu:4 (devices per node) — confirm
# against train.sh.
srun bash train.sh 4