"include/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "27b00d74169ac7756c48d7b6878d66fa5d678530"
Commit 63567b0c authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add model mobilenetv2

parents
import torch
import os
def init_ddp(visiable_devices='0,1,2,3'):
    """Initialise torch.distributed for a multi-GPU (DCU) run.

    Returns the local rank read from the LOCAL_RANK environment variable
    when more than one device is present, otherwise None (single-device run).
    """
    if torch.cuda.device_count() <= 1:
        # Single device: no process group required.
        return None
    # Restrict the visible ROCm devices before any device context is created.
    os.environ['HIP_VISIBLE_DEVICES'] = visiable_devices
    local_rank = int(os.environ["LOCAL_RANK"])
    print("local_rank:" + str(local_rank))
    # NCCL backend; rendezvous parameters come from the launcher environment.
    torch.distributed.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)
    return local_rank
This source diff could not be displayed because it is too large. You can view the blob instead.
import cv2
import csv
import numpy as np
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import torch
from torchvision import transforms
class DataFile():
    """Loader for a FER-2013-style CSV.

    Each data row is: column 0 = integer label, column 1 = space-separated
    pixel values, column 2 = usage/split tag.  Every image is resized to
    224x224, converted grey->RGB and normalised with ImageNet statistics
    so it can be fed to MobileNetV2.
    """

    def __init__(self, path, local_rank):
        # local_rank only gates progress printing (None = single process).
        self.labels = []
        self.pics = []
        self.usage = []
        self.local_rank = local_rank
        ts_proc = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
        ln = 0
        # `with` guarantees the file is closed even if a row is malformed.
        with open(path, 'r') as f:
            for row in csv.reader(f):
                if ln != 0:  # skip the header row
                    self.labels.append(int(row[0]))
                    arr = [int(x) for x in row[1].split(' ')]
                    nparr = np.array(arr, dtype=np.uint8)
                    # NOTE(review): nparr is 1-D here, so cv2.resize treats it
                    # as an Nx1 image.  FER-2013 pixels are 48x48 and likely
                    # need a reshape before resizing -- confirm data layout.
                    rmk_img = cv2.resize(nparr, (224, 224))
                    rmk_img = cv2.cvtColor(rmk_img, cv2.COLOR_GRAY2RGB)
                    self.pics.append(ts_proc(rmk_img))
                    self.usage.append(row[2])
                ln += 1
                if ln % 5000 == 0 and (local_rank is None or local_rank == 0):
                    print("{} pics loaded.".format(ln))

    def to_file(self):
        # Placeholder: persistence is not implemented.
        pass

    def get_data(self):
        """Return (labels, pics, usage) lists in file order."""
        return self.labels, self.pics, self.usage
class LabelFile():
    """Reads a relabelled FER-2013 CSV and keeps, for every data row, the
    index of the maximum vote among the label columns (columns 2 onward)."""

    def __init__(self, path, local_rank):
        # local_rank only gates progress printing (None = single process).
        self.labels = []
        ln = 0
        # `with` guarantees the handle is closed on any exit path.
        with open(path, 'r') as f:
            for row in csv.reader(f, delimiter=','):
                if ln != 0:  # skip the header row
                    lab_cells = np.array(row[2:], dtype=np.uint8)
                    # Majority label = argmax over the vote columns.
                    self.labels.append(np.argmax(lab_cells))
                ln += 1
                if ln % 5000 == 0 and (local_rank is None or local_rank == 0):
                    print("{} labels loaded.".format(ln))

    def get_labels(self):
        """Return the list of argmax label indices in file order."""
        return self.labels
class Fer2013Dataset(Dataset):
    """Train/test dataset over the fer2013 DDP CSV.

    Only raw labels 1, 2 and 3 are kept and remapped to contiguous class
    indices 0, 1, 2; the data is split 80/20 with a stratified
    train_test_split.  set_mode() switches which split __len__ and
    __getitem__ expose.
    """

    # Raw CSV label -> contiguous class index used for training.
    _LABEL_MAP = {1: 0, 2: 1, 3: 2}

    def __init__(self, local_rank):
        print('local_rank_datawork:', local_rank)
        self.datafile = DataFile('data/fer2013//DDP_data_231017.csv', local_rank)
        self.mode = 'train'
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Seed 0 for single-process runs, otherwise the local rank.
        self.randomization(0 if local_rank is None else local_rank)

    def randomization(self, seed):
        """Filter the raw data to the three target classes and split it.

        NOTE(review): `seed` is currently unused -- random_state is pinned
        to 0 so every rank sees the same split.  Confirm the intent before
        wiring `seed` into train_test_split.
        """
        labels, pics, usage = self.datafile.get_data()
        tarpics = []
        tarlabs = []
        for lab, pic in zip(labels, pics):
            if lab in self._LABEL_MAP:
                tarpics.append(pic)
                tarlabs.append(self._LABEL_MAP[lab])
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            tarpics, tarlabs, test_size=0.2, random_state=0, stratify=tarlabs)

    def __len__(self):
        # Length of the currently selected split.
        if self.mode == 'train':
            return len(self.y_train)
        elif self.mode == 'test':
            return len(self.y_test)

    def __getitem__(self, index):
        # Image tensor plus its label as a torch tensor, from the active split.
        if self.mode == 'train':
            return self.X_train[index], torch.tensor(self.y_train[index])
        elif self.mode == 'test':
            return self.X_test[index], torch.tensor(self.y_test[index])

    def set_mode(self, mode):
        """Select the split ('train' or 'test') exposed by len/getitem."""
        self.mode = mode
def show_pic(pixels):
    """Display one image in a blocking OpenCV window (any key closes it)."""
    cv2.imshow('Show', pixels)
    cv2.waitKey(0)  # block until a key press
    cv2.destroyAllWindows()
if __name__ == '__main__':
    # Smoke test: load and split the dataset as rank 0, then report completion.
    dataset = Fer2013Dataset(0)
    print('done')
import torch
import torch.nn as nn
# from peselibs_config import get_lib_path
import sys
# sys.path.append(get_lib_path())
import DDP
import torchvision.models.mobilenet as mobilenet
from datawork import *
from sklearn.metrics import accuracy_score
from fitlog import FitLog
from torch.utils.data import DataLoader
import time
import random
g_dubug = False
class MobileNetV2Driver():
    """Trains torchvision's MobileNetV2 on Fer2013Dataset, optionally under
    DistributedDataParallel when a local rank is supplied (None = single
    process)."""

    def __init__(self, local_rank):
        # DDP / system initialisation.
        self.nclass = 9
        self.batch_size = 64
        self.local_rank = local_rank
        self.nepoch = 500
        self.nround = 10
        self.lr = 0.00001  # NOTE(review): unused -- the Adam call below hard-codes lr=0.001
        self.loader = None
        self.test_loader = None
        self.dataset = None
        self.device = None
        # Model & device.
        self.model = mobilenet.MobileNetV2(num_classes=self.nclass)
        self._init_device()
        self.model.to(self.device)
        if self.local_rank is not None:
            self.model = nn.parallel.DistributedDataParallel(
                self.model, device_ids=[self.local_rank],
                find_unused_parameters=True)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=0.001, betas=(0.9, 0.999))
        self.dataset = Fer2013Dataset(local_rank)
        try:
            self.sampler = torch.utils.data.distributed.DistributedSampler(self.dataset)
        except Exception:
            # No initialised process group -> fall back to plain shuffling.
            self.sampler = None
        if self.local_rank is not None:
            # Sampler handles the shuffling/sharding under DDP.
            self.loader = DataLoader(
                self.dataset, batch_size=self.batch_size,
                sampler=self.sampler, shuffle=False)
        else:
            self.loader = DataLoader(
                self.dataset, batch_size=self.batch_size, shuffle=True)
        # NOTE(review): the test loader shares self.dataset and is never
        # sharded, so validation runs over the full active split.
        self.test_loader = DataLoader(
            self.dataset, batch_size=self.batch_size, shuffle=True)

    def _is_chief(self):
        # True for the logging/checkpointing process (rank 0 or single-process).
        return self.local_rank == 0 or self.local_rank is None

    def _init_device(self):
        # Device matching the local rank, else default CUDA device, else CPU.
        if self.local_rank is not None:
            self.device = torch.device('cuda', self.local_rank)
        elif torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

    def init_dataset(self, seed):
        """Re-split the dataset; `seed` is forwarded to randomization()."""
        self.dataset.randomization(seed)

    def train(self):
        """Run self.nepoch epochs.  The chief process logs per-batch timing
        (jishi log), periodic batch stats (fit log), per-epoch validation
        results (pred log) and saves the model each epoch."""
        best_acc = 0
        best_acc_at = 0
        if self._is_chief():
            self.fitlog = FitLog("./logs/")
            self.jishilog = FitLog("./logs/", prefix='jishi')  # per-batch timing
            self.dlog = FitLog("./logs/", prefix='pred')       # per-epoch predictions
        st_time = time.time()
        for epoch in range(self.nepoch):
            self.dataset.set_mode("train")
            self.model.train()
            all_loss = []
            for batch_idx, (data, target) in enumerate(self.loader):
                data, target = data.to(self.device), target.to(self.device)
                self.dataset.set_mode("train")
                if self._is_chief():
                    jishi1 = time.time()  # batch timer start
                self.model.train()
                self.optimizer.zero_grad()
                output = self.model(data)
                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()
                if self._is_chief():
                    jishi2 = time.time()
                    jishi2_log = '****epc:{},process:{}/{},best_acc:{},start:{},end:{},duration:{}****'.format(
                        str(epoch), str(batch_idx * len(data)), str(len(self.loader.dataset)),
                        str(best_acc), str(jishi1), str(jishi2), str(jishi2 - jishi1))
                    self.jishilog.append(jishi2_log)
                    print(jishi2_log)
                all_loss.append(loss.item())
                t1 = time.time()
                duration = t1 - st_time
                if (batch_idx % 10 == 0) and self._is_chief():
                    btstr = 'epc: {} [{}/{} ({:.0f}%)] loss: {:.6f} b-acc: {:.3f} @:{},curtime:{},duration:{}'.format(
                        epoch, batch_idx * len(data), len(self.loader.dataset),
                        100. * batch_idx / len(self.loader), loss.item(), best_acc,
                        best_acc_at, str(t1), str(duration))
                    self.fitlog.append(btstr)
                if g_dubug:
                    break
            # Checkpoint the (possibly DDP-wrapped) model once per epoch.
            # NOTE(review): original indentation was ambiguous; per-epoch
            # placement assumed -- confirm it was not meant per-batch.
            torch.save(self.model, './mobilenet.pth')
            if self._is_chief():
                t1 = time.time()
                duration = t1 - st_time
                acc, vloss, vloss_std, all_pred, all_tar = self._validate()
                epcstr = '****epc:{},loss:{:.6f},loss_std:{:.6f},vloss:{:.6f},vloss_std:{:.6f},acc:{:.3f},duration:{}****'.format(
                    epoch, np.mean(all_loss), np.std(all_loss), vloss, vloss_std,
                    acc, str(duration))
                self.dlog.append(epcstr)
                self.dlog.append("pred" + str(all_pred))
                self.dlog.append("tar" + str(all_tar))
                if acc > best_acc:
                    best_acc = acc
                    best_acc_at = epoch
                print(epcstr)
            if g_dubug:
                break
        if self._is_chief():
            self.fitlog.close()
            self.dlog.close()
            self.jishilog.close()

    def _validate(self):
        """Evaluate on the test split.

        Returns (mean running accuracy, mean loss, loss std, predictions,
        targets).  Note the accuracy appended per batch is computed over
        everything seen so far, not per-batch.
        """
        self.model.eval()
        self.dataset.set_mode('test')
        all_pred = []
        all_tar = []
        accs = []
        all_loss = []
        with torch.no_grad():
            for i, (ft, labs) in enumerate(self.test_loader):
                ft, labs = ft.to(self.device), labs.to(self.device)
                output = self.model(ft)
                loss = self.criterion(output, labs)
                preds = torch.argmax(output, dim=1).cpu().numpy().tolist()
                all_pred.extend(preds)
                all_tar.extend(labs.cpu().numpy().tolist())
                accs.append(accuracy_score(all_tar, all_pred))
                all_loss.append(loss.item())
                if i % 100 == 0:
                    print('validating @ batch {}'.format(i))
                if g_dubug:
                    break
        return np.mean(accs), np.mean(all_loss), np.std(all_loss), all_pred, all_tar

    def run(self, iround):
        """One round: reshuffle the dataset with seed=iround, then train."""
        self.init_dataset(iround)
        self.train()
if __name__ == '__main__':
    print(torch.cuda.is_available())
    # Seed every RNG involved so the run is reproducible.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    t0 = time.time()
    local_rank = DDP.init_ddp()
    print("local_rank=", local_rank)
    driver = MobileNetV2Driver(local_rank=local_rank)
    iround = 1
    driver.run(iround)
    t1 = time.time()
    # NOTE(review): time.time() returns seconds; dividing by 1000 makes the
    # printed value neither seconds nor milliseconds -- confirm the intent.
    print("result_time=", (t1 - t0) / 1000)
import torch
import torch.nn as nn
# from peselibs_config import get_lib_path
import sys
# sys.path.append(get_lib_path())
import DDP
import torchvision.models.mobilenet as mobilenet
from datawork import *
from sklearn.metrics import accuracy_score
from fitlog import FitLog
from torch.utils.data import DataLoader
import time
import torch.distributed as dist
import os
import datetime
import argparse
# Command-line options for multi-node distributed training; rank/world-size
# are supplied per-process by the launcher (see single_process.sh).
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--world-size', default=-1, type=int,
help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
# Debug flag ("dubug" [sic]): when True, loops break after one batch.
g_dubug = False
class MobileNetV2Driver():
    """Multi-node DDP variant of the MobileNetV2 trainer.

    The global rank and world size come from CLI args; each node exposes
    four DCUs, hence the `rank % 4` global-rank -> local-device mapping.
    """

    def __init__(self, args):
        # DDP / system initialisation from parsed CLI arguments.
        self.nclass = 9
        self.batch_size = 128  # 64
        self.nepoch = 50
        self.nround = 10
        self.lr = 0.00001  # NOTE(review): unused -- Adam below hard-codes its lr
        self.loader = None
        self.test_loader = None
        self.dataset = None
        self.device = None
        self.args = args
        # args.rank is the GLOBAL rank; local devices are picked via % 4.
        self.local_rank = args.rank
        # Model & device.
        self.model = mobilenet.MobileNetV2(num_classes=self.nclass)
        # Fixed: the original read the module-level `local_rank` global here
        # (and in the DDP wrap / dataset calls below) instead of the instance
        # attribute, which only worked when run as __main__.
        print("local_rank:{}".format(self.local_rank))
        self._init_device()
        self.model.to(self.device)
        print('$$$$$$$$$$$$$$$$$$$$$$$$', self.device)
        if self.local_rank is not None:
            self.model = nn.parallel.DistributedDataParallel(
                self.model, device_ids=[self.local_rank % 4],
                output_device=self.local_rank % 4, find_unused_parameters=True)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=0.00001, betas=(0.9, 0.999))
        print('##################################self.dataset#############################')
        self.dataset = Fer2013Dataset(self.local_rank % 4)
        print('self_data_load finish$$$$$$$$$$$$$$$$$$$$$')
        try:
            self.sampler = torch.utils.data.distributed.DistributedSampler(
                self.dataset, num_replicas=args.world_size, rank=args.rank)
        except Exception:
            # Process group not initialised -> plain shuffling fallback.
            self.sampler = None
        if self.local_rank is not None:
            # The sampler shuffles and shards across ranks.
            self.loader = DataLoader(
                self.dataset, batch_size=self.batch_size,
                sampler=self.sampler, shuffle=False)
        else:
            self.loader = DataLoader(
                self.dataset, batch_size=self.batch_size, shuffle=True)
        # NOTE(review): validation loader is not sharded; the chief
        # validates over the full active split.
        self.test_loader = DataLoader(
            self.dataset, batch_size=self.batch_size, shuffle=True)
        print('&&&&&&&&&&&&&&&&&&&&&dataset end&&&&&&&&&&&&&&&&&&&&&&&&&')

    def _is_chief(self):
        # True for the logging/checkpointing process (rank 0 or single-process).
        return self.local_rank == 0 or self.local_rank is None

    def _init_device(self):
        # Device for this process: global rank folded onto 4 DCUs per node.
        if self.local_rank is not None:
            self.device = torch.device('cuda', self.local_rank % 4)
        elif torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

    def init_dataset(self, seed):
        """Re-split the dataset; `seed` is forwarded to randomization()."""
        self.dataset.randomization(seed)

    def train(self):
        """Run self.nepoch epochs; the chief logs timing, batch stats and
        validation results, and saves the model each epoch."""
        best_acc = 0
        best_acc_at = 0
        if self._is_chief():
            self.fitlog = FitLog("logs/")
            self.jishilog = FitLog("logs/", prefix='jishi')  # per-batch timing
            self.dlog = FitLog("logs/", prefix='pred')       # per-epoch predictions
        st_time = time.time()
        for epoch in range(self.nepoch):
            self.dataset.set_mode("train")
            self.model.train()
            all_loss = []
            for batch_idx, (data, target) in enumerate(self.loader):
                data, target = data.to(self.device), target.to(self.device)
                self.dataset.set_mode("train")
                if self._is_chief():
                    jishi1 = time.time()  # batch timer start
                self.model.train()
                self.optimizer.zero_grad()
                output = self.model(data)
                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()
                if self._is_chief():
                    jishi2 = time.time()
                    jishi2_log = '****epc:{},process:{}/{},start:{},end:{},duration:{}****'.format(
                        str(epoch), str(batch_idx * len(data)), str(len(self.loader.dataset)),
                        str(jishi1), str(jishi2), str(jishi2 - jishi1))
                    self.jishilog.append(jishi2_log)
                    print(jishi2_log)
                all_loss.append(loss.item())
                t1 = time.time()
                duration = t1 - st_time
                if (batch_idx % 10 == 0) and self._is_chief():
                    btstr = 'epc: {} [{}/{} ({:.0f}%)] loss: {:.6f} b-acc: {:.3f} @:{},curtime:{},duration:{}'.format(
                        epoch, batch_idx * len(data), len(self.loader.dataset),
                        100. * batch_idx / len(self.loader), loss.item(), best_acc,
                        best_acc_at, str(t1), str(duration))
                    self.fitlog.append(btstr)
                if g_dubug:
                    break
            # Checkpoint once per epoch.  NOTE(review): original indentation
            # was ambiguous; per-epoch placement assumed.
            torch.save(self.model, './mobilenet.pth')
            if self._is_chief():
                t1 = time.time()
                duration = t1 - st_time
                acc, vloss, vloss_std, all_pred, all_tar = self._validate()
                epcstr = '****epc:{},loss:{:.6f},loss_std:{:.6f},vloss:{:.6f},vloss_std:{:.6f},acc:{:.3f},duration:{}****'.format(
                    epoch, np.mean(all_loss), np.std(all_loss), vloss, vloss_std,
                    acc, str(duration))
                self.dlog.append(epcstr + ",preds:{},plabs:{}".format(str(all_pred), str(all_tar)))
                if acc > best_acc:
                    best_acc = acc
                    best_acc_at = epoch
                print(epcstr)
            if g_dubug:
                break
        if self._is_chief():
            self.fitlog.close()
            self.dlog.close()
            self.jishilog.close()

    def _validate(self):
        """Evaluate on the test split.

        Returns (mean running accuracy, mean loss, loss std, predictions,
        targets); the accuracy appended each batch is cumulative.
        """
        self.model.eval()
        self.dataset.set_mode('test')
        all_pred = []
        all_tar = []
        accs = []
        all_loss = []
        with torch.no_grad():
            for i, (ft, labs) in enumerate(self.test_loader):
                ft, labs = ft.to(self.device), labs.to(self.device)
                output = self.model(ft)
                loss = self.criterion(output, labs)
                preds = torch.argmax(output, dim=1).cpu().numpy().tolist()
                all_pred.extend(preds)
                all_tar.extend(labs.cpu().numpy().tolist())
                accs.append(accuracy_score(all_tar, all_pred))
                all_loss.append(loss.item())
                if i % 100 == 0:
                    print('validating @ batch {}'.format(i))
                if g_dubug:
                    break
        return np.mean(accs), np.mean(all_loss), np.std(all_loss), all_pred, all_tar

    def run(self, iround):
        """One round: reshuffle the dataset with seed=iround, then train."""
        self.init_dataset(iround)
        self.train()
def init_ddp(args, visiable_devices='0,1,2,3'):
    """Initialise torch.distributed from parsed CLI arguments.

    Returns args.rank as this process's (global) rank when more than one
    device is visible, otherwise None.  `visiable_devices` is currently
    unused (device visibility is set by the launcher scripts).
    """
    if torch.cuda.device_count() <= 1:
        return None
    local_rank = args.rank
    print("local_rank:" + str(local_rank))
    # Echo the rendezvous configuration for debugging.
    print(args.dist_backend)
    print(args.dist_url)
    print(args.world_size)
    print(args.rank)
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    # Four DCUs per node -> fold the global rank onto a local device index.
    torch.cuda.set_device(local_rank % 4)
    return local_rank
if __name__ == '__main__':
    print("torch.cuda.is_available", torch.cuda.is_available())
    args = parser.parse_args()
    t0 = time.time()
    print(torch.cuda.device_count())
    # Join the process group, then build and run the trainer for one round.
    local_rank = init_ddp(args)
    print(local_rank)
    driver = MobileNetV2Driver(args)
    iround = 1
    driver.run(iround)
    t1 = time.time()
    # NOTE(review): time.time() is in seconds; dividing by 1000 makes the
    # printed value neither seconds nor milliseconds -- confirm the intent.
    print("result_time=", (t1 - t0) / 1000)
import datetime
import os
class FitLog:
def __init__(self, folderpath="", fname=None, prefix=''):
self.fname = fname
if self.fname == None:
self.fname = prefix + datetime.datetime.now().strftime("%y%m%d%H%M%S" + ".log")
self.fh = open(folderpath + self.fname, 'w', newline='')
def append(self, line, with_time=False, change_line=True):
str2append = ""
if with_time is False:
str2append = line
else:
str2append = str(datetime.datetime.now()) + " " + line
if change_line is True:
str2append += os.linesep
self.fh.write(str2append)
self.fh.flush()
def close(self):
self.fh.flush()
self.fh.close()
\ No newline at end of file
File added
#!/bin/bash
# Slurm job: one node, one task, 32 CPUs, one DCU; builds a hostfile and
# launches single_process.sh on every allocated node via mpirun.
#SBATCH -J test
#SBATCH -p wzhdexclu03
#SBATCH -N 1
##SBATCH -n 32
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=dcu:1
# Activate the PyTorch/DTK conda environment and toolchain modules.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10-dtk22.10-py38
#conda activate base
module purge
module load compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 compiler/dtk/23.04
module list
# NOTE(review): this runs driver.py directly AND launches it again via
# mpirun below -- one of the two is probably stale; confirm which.
python -u driver.py # the program to run
export LD_LIBRARY_PATH=${UCX_HOME}/lib:$LD_LIBRARY_PATH
env > env_$SLURM_JOBID
# Build a one-slot-per-node hostfile from the Slurm node list.
node_list=(`nodeset -e ${SLURM_NODELIST}`)
master_node=${node_list[0]}
for((i=0;i<${SLURM_NNODES};i++))
do
echo ${node_list[$i]} slots=1 >> hostfile-$SLURM_JOB_ID
done
echo mpirun -np ${SLURM_NNODES} --hostfile hostfile-$SLURM_JOB_ID ./single_process.sh ${SLURM_NNODES} ${master_node}
mpirun -np ${SLURM_NNODES} --hostfile hostfile-$SLURM_JOB_ID ./single_process.sh ${SLURM_NNODES} ${master_node}
#!/bin/bash
# Convenience wrapper: submit the one-node multi-DCU training job to Slurm.
sbatch run_multi_onenode.sh
def get_lib_path():
    """Return the path of the shared PESELibs directory (trailing slash kept)."""
    lib_root = "F:/SynologyDrive/ProjectsExtend/PESELibs/"
    return lib_root
import pandas as pd
from PIL import Image
import numpy as np

# Read fer2013 in 200-row chunks and preview the first image of each chunk.
df = pd.read_csv("data/fer2013/fer2013.csv", chunksize=200)
for d in df:
    pixels = d['pixels'].apply(lambda x: x.split(" ")).tolist()
    pix = [int(x) for x in pixels[0]]
    # Re-pack the flat pixel list into rows of 64 values.
    # NOTE(review): fer2013 images are 48x48; a row width of 64 looks
    # suspect -- confirm the expected image width before relying on this.
    pix_2arry = [pix[i:i + 64] for i in range(0, len(pix), 64)]
    img = Image.fromarray(np.array(pix_2arry).astype(np.uint8))
    img.show()
\ No newline at end of file
blinker==1.6.3
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.7
docopt==0.6.2
filelock==3.12.4
Flask==3.0.0
gitdb==4.0.10
GitPython==3.1.37
idna==3.4
importlib-metadata==6.8.0
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
MarkupSafe==2.1.3
numpy==1.24.4
opencv-python==4.8.1.78
Pillow==10.0.0
pip==23.2.1
requests==2.31.0
scikit-learn==1.3.1
scipy==1.10.1
setuptools==68.0.0
smmap==5.0.1
threadpoolctl==3.2.0
typing_extensions==4.7.1
urllib3==2.0.4
Werkzeug==3.0.0
wheel==0.38.4
zipp==3.17.0
#!/bin/bash
# Fixed: shebang was "#/bin/bash" (missing '!'), so the interpreter line
# was silently treated as a plain comment.
# Prepare log/hostfile directories, then submit the MPI training job.
mkdir -p logs
#rm -rf log/*
mkdir -p hostfile
sbatch run_mpi.sh
#!/bin/bash
# Slurm job: 4 nodes, 32 tasks, 4 DCUs per node; runs driver.py directly.
#SBATCH -J test
#SBATCH -p wzhdexclu03
#SBATCH -N 4
#SBATCH -n 32
##SBATCH --ntasks-per-node=4
##SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Activate the conda environment and DTK toolchain.
source ~/miniconda3/etc/profile.d/conda.sh
#conda activate torch1.10-dtk22.10-py38
conda activate base
module switch compiler/dtk/22.10
python -u driver.py # the program to run
#!/bin/bash
# Slurm job: 2 nodes x 4 DCUs; builds an mpirun hostfile and launches one
# single_process.sh per DCU, using the first node as the rendezvous master.
#SBATCH -p wzhdexclu03
#SBATCH -N 2
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --gres=dcu:4
#SBATCH -J multi_machine_dcu
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err
echo "START TIME: $(date)"
# Write the allocated hostnames to a per-job hostfile.
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
# Build an mpirun hostfile with 4 slots (one per DCU) per node.
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
# Total processes = unique nodes * 4 DCUs.
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
# The first node acts as the rendezvous master (dist_url).
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo ${dist_url}
# Activate the conda environment and DTK toolchain.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10-dtk23.04.1-py38
#conda activate base
module purge
module load compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 compiler/dtk/23.04.1
module list
mpirun -np $np --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single_process.sh $dist_url
#!/bin/bash
# Slurm job: one node, 4 DCUs, launched with torch.distributed.run
# (4 DDP workers on this node).
#SBATCH -p wzhdexclu03
#SBATCH -N 1
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --gres=dcu:4
#SBATCH -J onenode_4dcu
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err
# Activate the conda environment and DTK toolchain.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10-dtk22.10-py38
#conda activate base
module purge
module load compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 compiler/dtk/23.04
module list
# Expose all four DCUs, then start 4 workers via the elastic launcher.
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 -m torch.distributed.run --nproc_per_node 4 driver.py # the program to run
#!/bin/bash
# Slurm job: one node, a single DCU, plain (non-distributed) run.
#SBATCH -p wzhdexclu03
#SBATCH -N 1
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --gres=dcu:4
#SBATCH -J single_dcu
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err
# Activate the conda environment and DTK toolchain.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10-dtk22.10-py38
#conda activate base
module purge
module load compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 compiler/dtk/23.04
module list
# Expose only the first DCU so the run stays single-device.
export HIP_VISIBLE_DEVICES=0
python -u driver.py # the program to run
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment