Commit a4a4ae0f authored by sunzhq2's avatar sunzhq2
Browse files

update facenet migraphx infer

parent 00169466
...@@ -22,6 +22,8 @@ from tqdm import tqdm ...@@ -22,6 +22,8 @@ from tqdm import tqdm
from bert4torch.models import BaseModelDDP from bert4torch.models import BaseModelDDP
import os import os
import time import time
import onnxruntime as ort
maxlen = 256 maxlen = 256
batch_size = 64 batch_size = 64
...@@ -64,19 +66,21 @@ class MyDataset(ListDataset): ...@@ -64,19 +66,21 @@ class MyDataset(ListDataset):
# 建立分词器 # 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True) tokenizer = Tokenizer(dict_path, do_lower_case=True)
# if os.path.isfile("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr"): if os.path.isfile("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr"):
# model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr") model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr")
# else: else:
# # 加载模型 # 加载模型
# maxInput={"input":[64,256]} print("************load onnx model*************")
# model = migraphx.parse_onnx("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.onnx", map_input_dims=maxInput) maxInput={"input":[64,256]}
model = migraphx.parse_onnx("/home/sunzhq/workspace/yidong-infer/bert/bert4torch_cmcc/examples/sequence_labeling/mla-bert_best.onnx", map_input_dims=maxInput)
migraphx.quantize_fp16(model)
# migraphx.quantize_fp16(model) # 编译
model.compile(migraphx.get_target("gpu"),device_id=0)
# # 编译
# model.compile(migraphx.get_target("gpu"),device_id=0)
model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/new_modle_1.mxr") # model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/new_modle_1.mxr")
def collate_fn(batch): def collate_fn(batch):
batch_token_ids, batch_labels = [], [] batch_token_ids, batch_labels = [], []
...@@ -321,7 +325,7 @@ def build_model(config_path, checkpoint_path): ...@@ -321,7 +325,7 @@ def build_model(config_path, checkpoint_path):
return model return model
if __name__ == '__main__': if __name__ == '__main__':
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "./best_model.pt") ptmodel = build_model("/datasets/bert-base-chinese/config.json", "/models/best_model.pt")
crf = ptmodel.crf crf = ptmodel.crf
# time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt' # time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt'
......
...@@ -25,16 +25,16 @@ import time ...@@ -25,16 +25,16 @@ import time
import multiprocessing as mp import multiprocessing as mp
from multiprocessing import Process, Queue, Manager from multiprocessing import Process, Queue, Manager
maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG'] categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)} categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)} categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base # BERT base
maxlen = 256
batch_size = 64
config_path = '/datasets/bert-base-chinese/config.json' config_path = '/datasets/bert-base-chinese/config.json'
dict_path = '/datasets/bert-base-chinese/vocab.txt' dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = "cuda"
gpuid = os.getenv('HIP_VISIBLE_DEVICES') gpuid = os.getenv('HIP_VISIBLE_DEVICES')
labdir = os.path.join('results', gpuid, 'label') labdir = os.path.join('results', gpuid, 'label')
resultdir = os.path.join('results', gpuid, 'data') resultdir = os.path.join('results', gpuid, 'data')
...@@ -72,18 +72,19 @@ class MyDataset(ListDataset): ...@@ -72,18 +72,19 @@ class MyDataset(ListDataset):
# 建立分词器 # 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True) tokenizer = Tokenizer(dict_path, do_lower_case=True)
if os.path.isfile("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best_fp16.mxr"): if os.path.isfile("/models/bert_best_fp16.mxr"):
model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best_fp16.mxr") print("***********load mxr model******************")
print("###############migraphx-driver#####################") model = migraphx.load("/models/bert_best_fp16.mxr")
else: else:
# 加载模型 # 加载模型
maxInput={"input":[64,256]} maxInput={"input":[64,256]}
model = migraphx.parse_onnx("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.onnx", map_input_dims=maxInput) model = migraphx.parse_onnx("/models/bert_best.onnx", map_input_dims=maxInput)
migraphx.quantize_fp16(model) migraphx.quantize_fp16(model)
# 编译 # 编译
model.compile(migraphx.get_target("gpu"),offload_copy=False, device_id=0) model.compile(migraphx.get_target("gpu"),offload_copy=False, device_id=0)
inputName=list(model.get_inputs().keys())[0] inputName=list(model.get_inputs().keys())[0]
modelData=AllocateOutputMemory(model) modelData=AllocateOutputMemory(model)
...@@ -93,23 +94,6 @@ def collate_fn(batch): ...@@ -93,23 +94,6 @@ def collate_fn(batch):
batch_token_ids, batch_labels = [], [] batch_token_ids, batch_labels = [], []
maxlen = 256 maxlen = 256
for d in batch: for d in batch:
# # import pdb;pdb.set_trace()
# tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
# mapping = tokenizer.rematch(d[0], tokens)
# start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
# end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
# token_ids = tokenizer.tokens_to_ids(tokens)
# labels = np.zeros(len(token_ids))
# for start, end, label in d[1:]:
# if start in start_mapping and end in end_mapping:
# start = start_mapping[start]
# end = end_mapping[end]
# labels[start] = categories_label2id['B-'+label]
# labels[start + 1:end + 1] = categories_label2id['I-'+label]
# batch_token_ids.append(token_ids)
# batch_labels.append(labels)
# batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
# batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
tokens = tokenizer.tokenize(d[0], maxlen=maxlen) # 截断到 maxlen tokens = tokenizer.tokenize(d[0], maxlen=maxlen) # 截断到 maxlen
mapping = tokenizer.rematch(d[0], tokens) mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
...@@ -130,10 +114,10 @@ def collate_fn(batch): ...@@ -130,10 +114,10 @@ def collate_fn(batch):
batch_labels.append(labels) batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen, value=tokenizer._token_pad_id), batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen, value=tokenizer._token_pad_id),
dtype=torch.long, dtype=torch.long,
device=device) device="cuda:0")
batch_labels = torch.tensor(sequence_padding(batch_labels, length=maxlen, value=-100), batch_labels = torch.tensor(sequence_padding(batch_labels, length=maxlen, value=-100),
dtype=torch.long, dtype=torch.long,
device=device) device="cuda:0")
return batch_token_ids, batch_labels return batch_token_ids, batch_labels
# 转换数据集 # 转换数据集
...@@ -163,8 +147,6 @@ def pad_data_bin(data, output, bs, seq=256, len_catagory=7): ...@@ -163,8 +147,6 @@ def pad_data_bin(data, output, bs, seq=256, len_catagory=7):
return pad_data(data, seq) return pad_data(data, seq)
#crf = CRF(len(categories)).to(device)
#crf = CRF(len(categories))
def evaluate(data): def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10 X, Y, Z = 1e-10, 1e-10, 1e-10
...@@ -203,7 +185,7 @@ def evaluate(data): ...@@ -203,7 +185,7 @@ def evaluate(data):
preds_dcu = model.run(modelData) preds_dcu = model.run(modelData)
end += time.time() - start end += time.time() - start
infer_times.append(time.time() - start) infer_times.append(time.time() - start)
# print("******************:", 64/infer_times[-1]) print(f"****infer time: {infer_times[-1]} s***** fps: {64/infer_times[-1]}*********")
total_infer_times.append(time.time() - total_start) total_infer_times.append(time.time() - total_start)
result_1 = np.array(migraphx.from_gpu(preds_dcu[0])) result_1 = np.array(migraphx.from_gpu(preds_dcu[0]))
result_2 = np.array(migraphx.from_gpu(preds_dcu[1])) result_2 = np.array(migraphx.from_gpu(preds_dcu[1]))
...@@ -256,7 +238,6 @@ def evaluate(data): ...@@ -256,7 +238,6 @@ def evaluate(data):
data_idx += 1 data_idx += 1
total_start = time.time() total_start = time.time()
print("total_sample_data:", (64 * data_idx)) print("total_sample_data:", (64 * data_idx))
#avg_infer_time = sum(infer_times[1:]) / len(infer_times[1:])
avg_infer_fps = 64 * len(infer_times) / sum(infer_times) avg_infer_fps = 64 * len(infer_times) / sum(infer_times)
print(f"total_infer_time: {end}s") print(f"total_infer_time: {end}s")
print(f'avg_infer_fps: {avg_infer_fps}samples/s') print(f'avg_infer_fps: {avg_infer_fps}samples/s')
...@@ -270,28 +251,6 @@ def evaluate(data): ...@@ -270,28 +251,6 @@ def evaluate(data):
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2 f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2 return f1, precision, recall, f2, precision2, recall2
# def trans_entity2tuple(scores):
# '''把tensor转为(样本id, start, end, 实体类型)的tuple用于计算指标
# '''
# batch_entity_ids = set()
# for i, one_samp in enumerate(scores):
# entity_ids = []
# for j, item in enumerate(one_samp):
# flag_tag = categories_id2label[item.item()]
# if flag_tag.startswith('B-'): # B
# entity_ids.append([i, j, j, flag_tag[2:]])
# elif len(entity_ids) == 0:
# continue
# elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
# entity_ids[-1][-2] = j
# elif len(entity_ids[-1]) > 0:
# entity_ids.append([])
# for i in entity_ids:
# if i:
# batch_entity_ids.add(tuple(i))
# return batch_entity_ids
def trans_entity2tuple(scores): def trans_entity2tuple(scores):
'''把tensor转为(样本id, start, end, 实体类型)的tuple用于计算指标''' '''把tensor转为(样本id, start, end, 实体类型)的tuple用于计算指标'''
...@@ -343,7 +302,8 @@ def build_model(config_path, checkpoint_path): ...@@ -343,7 +302,8 @@ def build_model(config_path, checkpoint_path):
return model return model
if __name__ == '__main__': if __name__ == '__main__':
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "./best_model.pt")
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "/models/best_model.pt")
crf = ptmodel.crf crf = ptmodel.crf
# time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt' # time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt'
......
...@@ -30,7 +30,7 @@ categories_label2id = {k: i for i, k in enumerate(categories)} ...@@ -30,7 +30,7 @@ categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base # BERT base
config_path = '/datasets/bert-base-chinese/config.json' config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = "/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/best_model.pt" checkpoint_path = "/models/best_model.pt"
dict_path = '/datasets/bert-base-chinese/vocab.txt' dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu' device = 'cuda' if torch.cuda.is_available() else 'cpu'
#local_rank = int(os.environ['LOCAL_RANK']) #local_rank = int(os.environ['LOCAL_RANK'])
...@@ -131,7 +131,7 @@ optimizer = optim.Adam(model.parameters(), lr=6e-5) ...@@ -131,7 +131,7 @@ optimizer = optim.Adam(model.parameters(), lr=6e-5)
model.compile( model.compile(
loss=Loss(), loss=Loss(),
optimizer=optimizer, optimizer=optimizer,
use_apex=True, #此处设置是否采用apex_amp的混合精度 # use_apex=True, #此处设置是否采用apex_amp的混合精度
) )
#------------------------------------------------------------ #------------------------------------------------------------
...@@ -176,7 +176,7 @@ if __name__ == '__main__': ...@@ -176,7 +176,7 @@ if __name__ == '__main__':
# time_fw写入程序开始执行的时间 # time_fw写入程序开始执行的时间
time_fw.write('Start Time: {:.6f}\n'.format(time.time())) time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
model.load_weights("best_model.pt") model.load_weights("/models/best_model.pt")
evaluate(valid_dataloader) evaluate(valid_dataloader)
...@@ -125,3 +125,38 @@ Start Time: 1768874181.911044 ...@@ -125,3 +125,38 @@ Start Time: 1768874181.911044
End Time: 1768874210.087952 End Time: 1768874210.087952
Start Time: 1768874181.908558 Start Time: 1768874181.908558
End Time: 1768874210.517354 End Time: 1768874210.517354
Start Time: 1768958356.017406
End Time: 1768958383.966635
Start Time: 1768959668.076344
End Time: 1768959695.458553
Start Time: 1768966453.254532
Start Time: 1768966837.469851
Start Time: 1768966890.201351
Start Time: 1768966964.168061
Start Time: 1768967210.695314
End Time: 1768967238.448165
Start Time: 1768967388.564694
End Time: 1768967417.126865
Start Time: 1768967463.287595
Start Time: 1768973606.038938
Start Time: 1768973646.659818
Start Time: 1768973846.640323
Start Time: 1768974140.134768
Start Time: 1768974561.282243
Start Time: 1768975120.477424
Start Time: 1768975207.924210
Start Time: 1768975279.667178
Start Time: 1768975518.253238
Start Time: 1768975874.877119
Start Time: 1768975995.112462
Start Time: 1768976069.282189
Start Time: 1768976100.216421
Start Time: 1768976128.923981
Start Time: 1768976237.096360
Start Time: 1768976314.320467
Start Time: 1768976357.442332
Start Time: 1768976437.747921
Start Time: 1768976772.689363
Start Time: 1768983198.628691
Start Time: 1768983225.806162
Start Time: 1768983269.424441
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
nohup numactl -N 0 -m 0 python3 src/migraphx_infer.py 2>&1 | tee result_0.log &
export HIP_VISIBLE_DEVICES=1
nohup numactl -N 1 -m 1 python3 src/migraphx_infer.py 2>&1 | tee result_1.log &
export HIP_VISIBLE_DEVICES=2
nohup numactl -N 2 -m 2 python3 src/migraphx_infer.py 2>&1 | tee result_2.log &
export HIP_VISIBLE_DEVICES=3
nohup numactl -N 3 -m 3 python3 src/migraphx_infer.py 2>&1 | tee result_3.log &
# python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
# --image_size 160 \ # --image_size 160 \
# --distance_metric 1 \ # --distance_metric 1 \
...@@ -8,37 +17,37 @@ export HIP_VISIBLE_DEVICES=0 ...@@ -8,37 +17,37 @@ export HIP_VISIBLE_DEVICES=0
# --lfw_batch_size 64 # --lfw_batch_size 64
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_0.log & # --lfw_batch_size 64 2>&1 | tee result_0.log &
export HIP_VISIBLE_DEVICES=1 # export HIP_VISIBLE_DEVICES=1
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_1.log & # --lfw_batch_size 64 2>&1 | tee result_1.log &
export HIP_VISIBLE_DEVICES=2 # export HIP_VISIBLE_DEVICES=2
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_2.log & # --lfw_batch_size 64 2>&1 | tee result_2.log &
export HIP_VISIBLE_DEVICES=3 # export HIP_VISIBLE_DEVICES=3
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_3.log & # --lfw_batch_size 64 2>&1 | tee result_3.log &
# export HIP_VISIBLE_DEVICES=4 # export HIP_VISIBLE_DEVICES=4
# nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
# --image_size 160 \ # --image_size 160 \
......
...@@ -123,6 +123,7 @@ RANDOM_FLIP = 4 ...@@ -123,6 +123,7 @@ RANDOM_FLIP = 4
FIXED_STANDARDIZATION = 8 FIXED_STANDARDIZATION = 8
FLIP = 16 FLIP = 16
def create_input_pipeline(input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder): def create_input_pipeline(input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder):
import pdb;pdb.set_trace()
with tf.name_scope("tempscope"): with tf.name_scope("tempscope"):
images_and_labels_list = [] images_and_labels_list = []
for _ in range(nrof_preprocess_threads): for _ in range(nrof_preprocess_threads):
......
import os
import numpy as np
from PIL import Image # 推荐用于读取和调整图像
import onnxruntime as ort
import argparse
import lfw # 假设你有这个模块来读取 pairs 和 paths
import sys
# Import metrics and interpolation functions for AUC/EER calculation
from sklearn import metrics
from scipy.optimize import brentq
from scipy import interpolate
import time
from tqdm import tqdm
import migraphx
def AllocateOutputMemory(model):
    """Pre-allocate one GPU buffer for every model output.

    Returns a dict mapping each output name to a migraphx GPU argument
    allocated with that output's shape, ready to be passed to ``model.run``.
    """
    outputs = model.get_outputs()
    return {name: migraphx.allocate_gpu(s=shape) for name, shape in outputs.items()}
def preprocess_image(image_path, target_size=(160, 160)):
    """Load an image file and prepare it for FaceNet inference.

    Args:
        image_path: Path to the image file on disk.
        target_size: (width, height) the image is resized to (bilinear).

    Returns:
        An HxWx3 float32 numpy array normalized to roughly [-1, 1)
        via the FaceNet convention (x - 127.5) / 128.0.
    """
    # Use a context manager so the underlying file handle is closed;
    # the original left it open (PIL loads lazily), leaking descriptors
    # over the thousands of LFW images this script processes.
    with Image.open(image_path) as raw:
        rgb = raw.convert('RGB')  # guarantee 3 RGB channels
        rgb = rgb.resize(target_size, Image.Resampling.BILINEAR)
        img_np = np.array(rgb, dtype=np.float32)
    img_np = (img_np - 127.5) / 128.0
    return img_np
def load_lfw_for_onnx(lfw_dir, pairs_file, batch_size, image_size=(160, 160)):
    """Yield preprocessed LFW batches for ONNX/MIGraphX inference.

    Reads the LFW pairs file, preprocesses every image of every pair in
    order (NCHW float32), and yields ``(batch_array, labels, issame_part)``
    tuples of at most ``batch_size`` images; the last batch may be smaller.
    """
    pairs = lfw.read_pairs(os.path.expanduser(pairs_file))
    paths, actual_issame = lfw.get_paths(os.path.expanduser(lfw_dir), pairs)

    images, labels, issame_flags = [], [], []
    for pair_idx in range(len(actual_issame)):
        same = actual_issame[pair_idx]
        # Each pair contributes two consecutive paths.
        for img_path in (paths[2 * pair_idx], paths[2 * pair_idx + 1]):
            # Person identity is encoded as the parent directory name.
            person = os.path.basename(os.path.dirname(img_path))
            hwc = preprocess_image(img_path, target_size=image_size)
            # HWC -> CHW, contiguous, as the model input layout expects.
            chw = np.ascontiguousarray(np.transpose(hwc, (2, 0, 1)))
            images.append(chw)
            labels.append(person)
            issame_flags.append(same)

    total = len(images)
    num_batches, remainder = divmod(total, batch_size)
    if remainder != 0:
        print(f"Warning: Number of images ({total}) is not evenly divisible by batch size ({batch_size}). Last batch will be smaller.")
        num_batches += 1

    for b in range(num_batches):
        lo = b * batch_size
        hi = min(lo + batch_size, total)
        batch_array = np.stack(images[lo:hi], axis=0).astype(np.float32)
        yield batch_array, labels[lo:hi], issame_flags[lo:hi]
def main_onnx(args):
    """Run FaceNet embedding inference on LFW with a MIGraphX model and report accuracy/timing.

    Pipeline: stream preprocessed LFW batches, pad the last batch up to the
    model's fixed batch size of 64, run GPU inference, collect embeddings,
    then evaluate with the LFW protocol (accuracy, AUC, EER) and print
    throughput statistics.

    Args:
        args: parsed CLI namespace with lfw_dir, lfw_pairs, lfw_batch_size,
              model_path and image_size attributes.
    """
    embedding_size = 512  # FaceNet output dimension — assumed to match the model; TODO confirm
    pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
    paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
    nrof_pairs = len(actual_issame)
    nrof_images = nrof_pairs * 2
    print(f"Number of pairs: {nrof_pairs}, Number of images: {nrof_images}, Embedding size: {embedding_size}")

    data_generator = load_lfw_for_onnx(
        args.lfw_dir,
        args.lfw_pairs,
        args.lfw_batch_size,
        image_size=(args.image_size, args.image_size),
    )
    # Batch count only drives the tqdm progress bar.
    total_images_calculated = len(paths)
    num_batches_calculated = total_images_calculated // args.lfw_batch_size
    if total_images_calculated % args.lfw_batch_size != 0:
        num_batches_calculated += 1

    all_embeddings = np.zeros((nrof_images, embedding_size), dtype=np.float32)
    current_image_index = 0

    model = migraphx.load(args.model_path)
    inputName = list(model.get_inputs().keys())[0]
    modelData = AllocateOutputMemory(model)
    # Warm-up run so the first timed batch does not pay one-time setup costs.
    modelData[inputName] = migraphx.to_gpu(migraphx.argument(np.ones([64, 3, 160, 160]).astype(np.float32)))
    model.run(modelData)

    infer_times = []        # pure inference time per batch
    total_infer_times = []  # data loading + inference time per batch
    total_start = time.time()
    for i, (batch_images, _, _) in enumerate(tqdm(data_generator, total=num_batches_calculated, desc="Processing Batches")):
        original_batch_size = batch_images.shape[0]
        if original_batch_size < 64:
            # Model has a fixed batch size of 64: pad the short last batch by
            # repeating its final image, then discard the padded embeddings.
            pad_size = 64 - original_batch_size
            padding_images = np.repeat(batch_images[-1:], pad_size, axis=0)
            batch_images = np.concatenate((batch_images, padding_images), axis=0)
        modelData[inputName] = migraphx.to_gpu(migraphx.argument(batch_images))
        start = time.time()
        embeddings_dcu = model.run(modelData)
        embeddings_1 = np.array(migraphx.from_gpu(embeddings_dcu[0]))
        infer_time_taken = time.time() - start
        infer_times.append(infer_time_taken)
        if original_batch_size == 64:
            embeddings = embeddings_1
        else:
            embeddings = embeddings_1[:original_batch_size]
        batch_size_current = embeddings.shape[0]
        all_embeddings[current_image_index:current_image_index + batch_size_current] = embeddings
        current_image_index += batch_size_current  # Move index forward
        if i % 10 == 9:
            print('.', end='')
            sys.stdout.flush()
        total_infer_times.append(time.time() - total_start)
        total_start = time.time()  # Reset timer for next batch's data loading + inference
    print("\nAll batches processed.")

    # Fixed: the original allocated a throwaway np.zeros buffer here and
    # immediately rebound the name to all_embeddings — dead allocation removed.
    final_embeddings = all_embeddings
    distance_metric = 1  # Euclidean
    subtract_mean = True
    nrof_folds = 10
    tpr, fpr, accuracy, val, val_std, far = lfw.evaluate(
        final_embeddings,
        actual_issame,
        nrof_folds=nrof_folds,
        distance_metric=distance_metric,
        subtract_mean=subtract_mean
    )
    print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy)))
    print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
    auc = metrics.auc(fpr, tpr)
    print('Area Under Curve (AUC): %1.3f' % auc)
    eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
    print('Equal Error Rate (EER): %1.3f' % eer)

    print("***************************")
    # NOTE(review): FPS uses 64 per batch, so padded images in the last batch
    # count toward throughput — consistent with the original behavior.
    infer_time = sum(infer_times)
    avg_infer_fps = 64 * len(infer_times) / sum(infer_times)
    print(f"total_infer_time: {infer_time}s")
    print(f'avg_infer_fps: {avg_infer_fps}samples/s')
    load_data_infer_time = sum(total_infer_times)
    load_data_avg_infer_fps = len(total_infer_times) * 64 / sum(total_infer_times)
    print(f'load_data_total_infer_time: {load_data_infer_time}s')
    print(f'load_data_avg_total_Infer_fps: {load_data_avg_infer_fps} samples/s')
    print("******************************")
def parse_arguments_onnx():
    """Parse command-line options for the MIGraphX LFW validation run."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        '--lfw_dir', type=str,
        default="/datasets/lfw_mtcnnpy_160",
        help='Path to the data directory containing aligned LFW face patches.')
    ap.add_argument(
        '--lfw_batch_size', type=int,
        help='Number of images to process in a batch in the LFW test set.',
        default=64)  # Changed default to common ONNX batch size
    ap.add_argument(
        '--model_path', type=str,
        default="/home/sunzhq/workspace/yidong-infer/facenet/facenet/tools/onnx-models/facenet_static_bs64.mxr",
        help='Path to the ONNX model file.')
    ap.add_argument(
        '--image_size', type=int,
        help='Image size (height, width) in pixels.',
        default=160)
    ap.add_argument(
        '--lfw_pairs', type=str,
        help='The file containing the pairs to use for validation.',
        default='data/pairs.txt')
    return ap.parse_args()
if __name__ == '__main__':
    # Script entry point: parse CLI options, then run the MIGraphX LFW evaluation.
    main_onnx(parse_arguments_onnx())
import os
import numpy as np
from PIL import Image
import argparse
import lfw
import sys
from sklearn import metrics
from scipy.optimize import brentq
from scipy import interpolate
import time
from tqdm import tqdm
import migraphx
def AllocateOutputMemory(model):
    """Allocate a GPU output buffer per model output.

    Builds and returns ``{output_name: migraphx GPU argument}`` so the
    same buffers can be reused across every ``model.run`` call.
    """
    shapes = model.get_outputs()
    return {name: migraphx.allocate_gpu(s=shapes[name]) for name in shapes.keys()}
def evaluate_embeddings_with_different_methods(embeddings, actual_issame, use_flipped_images, embedding_size):
    """Evaluate embedding vectors under different aggregation schemes.

    Args:
        embeddings: stacked embedding matrix; without flips one row per image
            (2 per pair), with flips the rows alternate original/flipped
            (4 per pair) — assumed ordering, TODO confirm against the caller.
        actual_issame: per-pair ground-truth same/different flags.
        use_flipped_images: whether flipped copies were interleaved.
        embedding_size: width of a single embedding vector.

    Returns:
        Dict mapping method name to {"accuracy", "std", "auc"}.
        NOTE(review): if use_flipped_images is True but the row count is not
        nrof_pairs * 4, no branch fires and an empty dict is returned silently.
    """
    results = {}
    nrof_pairs = len(actual_issame)
    # Method 0: plain evaluation (no flipped images).
    if not use_flipped_images:
        tpr, fpr, accuracy, val, val_std, far = lfw.evaluate(
            embeddings,
            actual_issame,
            nrof_folds=10,
            distance_metric=1,
            subtract_mean=True
        )
        results["original"] = {
            "accuracy": np.mean(accuracy),
            "std": np.std(accuracy),
            "auc": metrics.auc(fpr, tpr)
        }
    # Method 1: the original TensorFlow concatenation scheme.
    elif embeddings.shape[0] == nrof_pairs * 4:  # flipped images are present
        # Method 1A: concatenate original and flipped embeddings side by side,
        # producing one double-width vector per image (even rows = original,
        # odd rows = flipped — relies on the interleaved ordering above).
        final_embeddings = np.zeros((nrof_pairs * 2, embedding_size * 2))
        final_embeddings[:, :embedding_size] = embeddings[0::2]
        final_embeddings[:, embedding_size:] = embeddings[1::2]
        tpr, fpr, accuracy, val, val_std, far = lfw.evaluate(
            final_embeddings,
            actual_issame,
            nrof_folds=10,
            distance_metric=1,
            subtract_mean=True
        )
        results["original+flipped"] = {
            "accuracy": np.mean(accuracy),
            "std": np.std(accuracy),
            "auc": metrics.auc(fpr, tpr)
        }
    return results
def main_optimized(args):
    """Optimized main: run FaceNet inference on LFW via MIGraphX, with optional flipped images, and print evaluation + timing stats."""
    # Load the compiled MIGraphX model and pre-allocate its GPU output buffers.
    model = migraphx.load(args.migraphx_model_path)
    input_name = list(model.get_inputs().keys())[0]
    modelData=AllocateOutputMemory(model)
    embedding_size=512  # assumed FaceNet output width — TODO confirm against the model
    print("="*70)
    # Load the LFW pair list.
    pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
    paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
    nrof_pairs = len(actual_issame)
    # Build the flat image-path list with per-image flip flags; with flips
    # enabled the order is original/flipped interleaved per image.
    all_image_paths = []
    flip_flags = []
    print("\nPreparing image paths...")
    for i in tqdm(range(nrof_pairs), desc="Organizing pairs"):
        path0 = paths[i*2]
        path1 = paths[i*2+1]
        # First image of the pair
        all_image_paths.append(path0)
        flip_flags.append(False)
        if args.use_flipped_images:
            all_image_paths.append(path0)
            flip_flags.append(True)
        # Second image of the pair
        all_image_paths.append(path1)
        flip_flags.append(False)
        if args.use_flipped_images:
            all_image_paths.append(path1)
            flip_flags.append(True)
    nrof_images = len(all_image_paths)
    print(f"Total images to process: {nrof_images}")
    # Pre-allocate storage for every embedding.
    all_embeddings = np.zeros((nrof_images, embedding_size), dtype=np.float32)
    # Inference loop.
    print("\nRunning inference...")
    infer_times = []
    for start_idx in tqdm(range(0, nrof_images, args.lfw_batch_size), desc="Processing"):
        end_idx = min(start_idx + args.lfw_batch_size, nrof_images)
        batch_paths = all_image_paths[start_idx:end_idx]
        batch_flip_flags = flip_flags[start_idx:end_idx]
        # Preprocess the batch.
        batch_images = []
        for img_path, flip_flag in zip(batch_paths, batch_flip_flags):
            # Read and preprocess with PIL.
            img = Image.open(img_path).convert('RGB')
            img = img.resize((args.image_size, args.image_size), Image.Resampling.BILINEAR)
            img_np = np.array(img, dtype=np.float32)
            if flip_flag:
                img_np = np.fliplr(img_np)
            # FaceNet normalization.
            img_np = (img_np - 127.5) / 128.0
            # HWC -> CHW layout.
            img_np = np.transpose(img_np, (2, 0, 1))
            batch_images.append(img_np)
        batch_array = np.stack(batch_images, axis=0).astype(np.float32)
        # The compiled model has a fixed batch of 64: pad a short final batch
        # by repeating its last image; padded rows are dropped after inference.
        if batch_array.shape[0] < 64:
            pad_size = 64 - batch_array.shape[0]
            padding = np.repeat(batch_array[-1:], pad_size, axis=0)
            batch_for_infer = np.concatenate([batch_array, padding], axis=0)
        else:
            batch_for_infer = batch_array
        # Convert to a MIGraphX argument (must be contiguous).
        batch_for_infer = np.ascontiguousarray(batch_for_infer)
        # mgx_arg = migraphx.to_gpu(migraphx.argument(batch_for_infer))
        # model_data = {input_name: mgx_arg}
        modelData[input_name] = migraphx.to_gpu(migraphx.argument(batch_for_infer))
        # Run inference (timed).
        infer_start = time.time()
        output = model.run(modelData)
        infer_time = time.time() - infer_start
        infer_times.append(infer_time)
        # Extract the embedding vectors back to host memory.
        embeddings_np = np.array(migraphx.from_gpu(output[0]))
        if batch_array.shape[0] < 64:
            embeddings_np = embeddings_np[:batch_array.shape[0]]
        all_embeddings[start_idx:end_idx] = embeddings_np
    print("\n" + "="*70)
    print("EVALUATION RESULTS")
    print("="*70)
    # Evaluate under the available aggregation methods.
    results = evaluate_embeddings_with_different_methods(
        all_embeddings,
        actual_issame,
        args.use_flipped_images,
        embedding_size
    )
    # Print the comparison table.
    print("\nComparison of different methods:")
    print("-"*70)
    for method_name, result in results.items():
        print(f"{method_name:20} | Accuracy: {result['accuracy']:.5f} ± {result['std']:.5f} | AUC: {result['auc']:.3f}")
    # Performance statistics (pure inference time only).
    if infer_times:
        total_infer_time = sum(infer_times)
        avg_fps = nrof_images / total_infer_time
        print("\n" + "="*70)
        print("PERFORMANCE STATISTICS")
        print("-"*70)
        print(f"Total inference time: {total_infer_time:.3f}s")
        print(f"Average FPS: {avg_fps:.1f} images/s")
        print(f"Number of images: {nrof_images}")
        if args.use_flipped_images:
            print(f" (Note: {nrof_pairs * 2} original images + their flips)")
if __name__ == '__main__':
    # CLI entry point: build the argument parser inline and hand the parsed
    # namespace to main_optimized.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lfw_dir', type=str, default="/datasets/lfw_mtcnnpy_160")
    parser.add_argument('--lfw_batch_size', type=int, default=64)
    parser.add_argument('--migraphx_model_path', type=str,
                        default="/home/sunzhq/workspace/yidong-infer/facenet/facenet/tools/onnx-models/facenet_static_bs64_fp32.mxr")
    parser.add_argument('--image_size', type=int, default=160)
    parser.add_argument('--lfw_pairs', type=str, default='data/pairs.txt')
    parser.add_argument('--use_flipped_images', action='store_true')
    # NOTE(review): accepted but never read inside main_optimized — kept for
    # CLI compatibility with the original validate_on_lfw flags.
    parser.add_argument('--use_fixed_image_standardization', action='store_true')
    args = parser.parse_args()
    main_optimized(args)
\ No newline at end of file
import onnx

# Small inspection script: load an ONNX model, validate it, and print its
# input/output names, element types and shapes.

# Load the ONNX model
model_path = "/home/sunzhq/workspace/yidong-infer/facenet/facenet/tools/onnx-models/facenet_static_bs64.onnx"  # replace with your actual .onnx file path
model = onnx.load(model_path)
# Check that the model is structurally valid
onnx.checker.check_model(model)
# Grab the graph
graph = model.graph
print("--- Model Info ---")
print(f"Model Name: {model.producer_name or 'Unknown'}")
print(f"ONNX Version: {model.ir_version}")
print("\n--- Input Information ---")
for input_tensor in graph.input:
    print(f"Name: {input_tensor.name}")
    print(f"Type: {input_tensor.type.tensor_type.elem_type}")
    # Resolve shape: symbolic dim_param when present, otherwise the fixed dim_value
    shape_dim = [dim.dim_param if dim.dim_param else dim.dim_value for dim in input_tensor.type.tensor_type.shape.dim]
    print(f"Shape: {shape_dim}")
    print("-" * 20)
print("\n--- Output Information ---")
for output_tensor in graph.output:
    print(f"Name: {output_tensor.name}")
    print(f"Type: {output_tensor.type.tensor_type.elem_type}")
    # Resolve shape
    shape_dim = [dim.dim_param if dim.dim_param else dim.dim_value for dim in output_tensor.type.tensor_type.shape.dim]
    print(f"Shape: {shape_dim}")
    print("-" * 20)
# Optional: overview of all nodes
# print("\n--- Node Overview ---")
# for i, node in enumerate(graph.node[:5]):  # print only the first 5 nodes as a sample
#     print(f"Node {i}: {node.op_type} -> {node.output[0]} (inputs: {node.input})")
# if len(graph.node) > 5:
#     print(f"... and {len(graph.node) - 5} more nodes")
\ No newline at end of file
# Compile the static-batch FaceNet ONNX model into a serialized MIGraphX
# binary (.mxr) with FP16 quantization; the input is pinned to NCHW 64x3x160x160.
migraphx-driver compile ./onnx-models/facenet_static_bs64.onnx \
--binary --fp16 \
--output ./onnx-models/facenet_static_bs64.mxr \
--input-dim @input:0 64 3 160 160
# --input-dim @phase_train:0 0
\ No newline at end of file
import tensorflow as tf
from google.protobuf import text_format
from tensorflow.core.framework import graph_pb2
from tensorflow.python.framework import graph_util
import numpy as np
def fix_phase_train_and_save(input_pb_path, output_pb_path_fixed):
    """Replace the 'phase_train' placeholder in a frozen TF graph with a scalar Const(False) and save the result.

    This lets the graph be exported (e.g. via tf2onnx) without supplying the
    boolean training flag at inference time.

    Args:
        input_pb_path: path to the source frozen GraphDef (.pb).
        output_pb_path_fixed: path where the rewritten GraphDef is written.
    """
    # Parse the frozen GraphDef from disk.
    with tf.io.gfile.GFile(input_pb_path, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    # Rebuild the graph node-by-node, swapping out the placeholder.
    new_graph_def = tf.compat.v1.GraphDef()
    for node in graph_def.node:
        if node.name == 'phase_train' or (node.name == 'phase_train:0' and node.op == 'Placeholder'):
            print(f"Found phase_train node: Name='{node.name}', Op='{node.op}', Dtype={node.attr['dtype'].type}")
            print(f" - Note: Actual output tensor name is likely '{node.name}:0'")
            # Local imports: proto helpers only needed on this branch.
            from tensorflow.core.framework import node_def_pb2, attr_value_pb2, tensor_pb2, types_pb2
            from tensorflow.core.framework import tensor_shape_pb2
            from tensorflow.python.framework import tensor_util
            # Build a Const node that keeps the original name so every
            # consumer of 'phase_train' rewires transparently.
            const_node = node_def_pb2.NodeDef()
            const_node.op = "Const"
            const_node.name = node.name # Use the same name ('phase_train')
            if node.device:
                const_node.device = node.device
            const_node.attr["dtype"].CopyFrom(node.attr["dtype"]) # Should be DT_BOOL (types_pb2.DT_BOOL)
            # Scalar boolean False tensor (inference mode).
            false_tensor = tensor_pb2.TensorProto(
                dtype=types_pb2.DT_BOOL,
                bool_val=[False],
                # tensor_shape=scalar_tensor_shape
            )
            # Clearing tensor_shape leaves an (empty) scalar shape.
            false_tensor.ClearField('tensor_shape')
            const_node.attr["value"].CopyFrom(attr_value_pb2.AttrValue(tensor=false_tensor))
            new_graph_def.node.extend([const_node])
            print(f"Replaced '{node.name}' with a scalar Const node having value False.")
        else:
            new_graph_def.node.extend([node])
    # NOTE(review): set but unused — presumably intended for a pruning step
    # such as extract_sub_graph; confirm before deleting.
    output_node_name_without_port = 'embeddings'
    # Best effort: strip training-only nodes; the graph is still usable if this fails.
    try:
        new_graph_def = graph_util.remove_training_nodes(
            input_graph_def=new_graph_def,
            protected_nodes=[]
        )
        print(f"Applied remove_training_nodes optimization.")
    except Exception as e:
        print(f"Warning: Could not apply remove_training_nodes: {e}. Proceeding with current graph_def.")
    # Save the modified graph
    with tf.io.gfile.GFile(output_pb_path_fixed, 'wb') as f:
        f.write(new_graph_def.SerializeToString())
    print(f"Modified .pb saved to: {output_pb_path_fixed}")
# Driver: rewrite the frozen FaceNet graph in place on disk, then print
# the follow-up tf2onnx command the user should run on the fixed .pb.
input_pb = "/home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900.pb"
fixed_pb = "/home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900_fixed_scalar.pb"
fix_phase_train_and_save(input_pb, fixed_pb)

hint_lines = [
    "\nNow run tf2onnx on the fixed .pb file (with scalar phase_train):",
    "python -m tf2onnx.convert \\",
    f" --input {fixed_pb} \\",
    " --inputs \"input:0[64,160,160,3]\" \\",
    " --outputs embeddings:0 \\",  # phase_train input is no longer needed
    " --output ./onnx-models/facenet_static_bs64.onnx \\",
    " --opset 11",
]
for hint in hint_lines:
    print(hint)
\ No newline at end of file
# Convert the fixed frozen graph (phase_train baked in as a Const False)
# to a static-batch-64 ONNX model.  --inputs-as-nchw transposes the NHWC
# TensorFlow input layout to the NCHW layout expected downstream.
python -m tf2onnx.convert \
--input /home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900_fixed_scalar.pb \
--output ./onnx-models/facenet_static_bs64.onnx \
--inputs input:0[64,160,160,3] \
--outputs embeddings:0 \
--opset 15 \
--inputs-as-nchw input:0
# Earlier attempt against the unfixed .pb (kept for reference):
# python -m tf2onnx.convert \
# --input /home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900.pb \
# --inputs "input:0[64,160,160,3]" \
# --inputs "phase_train:0False" \
# --outputs embeddings:0 \
# --output ./onnx-models/facenet_static_bs64.onnx \
# --opset 11
# python -m tf2onnx.convert --input frozen_model.pb --output model.onnx --inputs image_paths:0 --outputs embeddings:0
\ No newline at end of file
...@@ -3,18 +3,18 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1 ...@@ -3,18 +3,18 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
# /datasets/cifar100/cifar-100-python/test # /datasets/cifar100/cifar-100-python/test
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
# numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset /datasets/cifar100/test --batch_size 24 --fp16 True
nohup numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_0.log & # nohup numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_0.log &
export HIP_VISIBLE_DEVICES=1 # export HIP_VISIBLE_DEVICES=1
nohup numactl -N 1 -m 1 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_1.log & # nohup numactl -N 1 -m 1 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_1.log &
export HIP_VISIBLE_DEVICES=2 # export HIP_VISIBLE_DEVICES=2
nohup numactl -N 2 -m 2 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_2.log & # nohup numactl -N 2 -m 2 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_2.log &
export HIP_VISIBLE_DEVICES=3 # export HIP_VISIBLE_DEVICES=3
nohup numactl -N 3 -m 3 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_3.log & # nohup numactl -N 3 -m 3 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_3.log &
# export HIP_VISIBLE_DEVICES=4 # export HIP_VISIBLE_DEVICES=4
# nohup python resnet50_migraphx.py --model /workspace/mmpretrain-main/resnet50.onnx --dataset /datasets/cifar100/ --batch_size 24 --fp16 True 2>&1 | tee result_4.log & # nohup python resnet50_migraphx.py --model /workspace/mmpretrain-main/resnet50.onnx --dataset /datasets/cifar100/ --batch_size 24 --fp16 True 2>&1 | tee result_4.log &
......
import argparse import argparse
import argparse
import cv2 import cv2
import numpy as np import numpy as np
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment