Commit a4a4ae0f authored by sunzhq2's avatar sunzhq2
Browse files

update facenet migraphx infer

parent 00169466
...@@ -22,6 +22,8 @@ from tqdm import tqdm ...@@ -22,6 +22,8 @@ from tqdm import tqdm
from bert4torch.models import BaseModelDDP from bert4torch.models import BaseModelDDP
import os import os
import time import time
import onnxruntime as ort
maxlen = 256 maxlen = 256
batch_size = 64 batch_size = 64
...@@ -64,19 +66,21 @@ class MyDataset(ListDataset): ...@@ -64,19 +66,21 @@ class MyDataset(ListDataset):
# 建立分词器 # 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True) tokenizer = Tokenizer(dict_path, do_lower_case=True)
# if os.path.isfile("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr"): if os.path.isfile("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr"):
# model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr") model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.mxr")
# else: else:
# # 加载模型 # 加载模型
# maxInput={"input":[64,256]} print("************load onnx model*************")
# model = migraphx.parse_onnx("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.onnx", map_input_dims=maxInput) maxInput={"input":[64,256]}
model = migraphx.parse_onnx("/home/sunzhq/workspace/yidong-infer/bert/bert4torch_cmcc/examples/sequence_labeling/mla-bert_best.onnx", map_input_dims=maxInput)
migraphx.quantize_fp16(model)
# migraphx.quantize_fp16(model) # 编译
model.compile(migraphx.get_target("gpu"),device_id=0)
# # 编译
# model.compile(migraphx.get_target("gpu"),device_id=0)
model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/new_modle_1.mxr") # model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/new_modle_1.mxr")
def collate_fn(batch): def collate_fn(batch):
batch_token_ids, batch_labels = [], [] batch_token_ids, batch_labels = [], []
...@@ -321,7 +325,7 @@ def build_model(config_path, checkpoint_path): ...@@ -321,7 +325,7 @@ def build_model(config_path, checkpoint_path):
return model return model
if __name__ == '__main__': if __name__ == '__main__':
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "./best_model.pt") ptmodel = build_model("/datasets/bert-base-chinese/config.json", "/models/best_model.pt")
crf = ptmodel.crf crf = ptmodel.crf
# time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt' # time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt'
......
...@@ -25,16 +25,16 @@ import time ...@@ -25,16 +25,16 @@ import time
import multiprocessing as mp import multiprocessing as mp
from multiprocessing import Process, Queue, Manager from multiprocessing import Process, Queue, Manager
maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG'] categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)} categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)} categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base # BERT base
maxlen = 256
batch_size = 64
config_path = '/datasets/bert-base-chinese/config.json' config_path = '/datasets/bert-base-chinese/config.json'
dict_path = '/datasets/bert-base-chinese/vocab.txt' dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = "cuda"
gpuid = os.getenv('HIP_VISIBLE_DEVICES') gpuid = os.getenv('HIP_VISIBLE_DEVICES')
labdir = os.path.join('results', gpuid, 'label') labdir = os.path.join('results', gpuid, 'label')
resultdir = os.path.join('results', gpuid, 'data') resultdir = os.path.join('results', gpuid, 'data')
...@@ -72,18 +72,19 @@ class MyDataset(ListDataset): ...@@ -72,18 +72,19 @@ class MyDataset(ListDataset):
# 建立分词器 # 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True) tokenizer = Tokenizer(dict_path, do_lower_case=True)
if os.path.isfile("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best_fp16.mxr"): if os.path.isfile("/models/bert_best_fp16.mxr"):
model = migraphx.load("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best_fp16.mxr") print("***********load mxr model******************")
print("###############migraphx-driver#####################") model = migraphx.load("/models/bert_best_fp16.mxr")
else: else:
# 加载模型 # 加载模型
maxInput={"input":[64,256]} maxInput={"input":[64,256]}
model = migraphx.parse_onnx("/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/bert_best.onnx", map_input_dims=maxInput) model = migraphx.parse_onnx("/models/bert_best.onnx", map_input_dims=maxInput)
migraphx.quantize_fp16(model) migraphx.quantize_fp16(model)
# 编译 # 编译
model.compile(migraphx.get_target("gpu"),offload_copy=False, device_id=0) model.compile(migraphx.get_target("gpu"),offload_copy=False, device_id=0)
inputName=list(model.get_inputs().keys())[0] inputName=list(model.get_inputs().keys())[0]
modelData=AllocateOutputMemory(model) modelData=AllocateOutputMemory(model)
...@@ -93,23 +94,6 @@ def collate_fn(batch): ...@@ -93,23 +94,6 @@ def collate_fn(batch):
batch_token_ids, batch_labels = [], [] batch_token_ids, batch_labels = [], []
maxlen = 256 maxlen = 256
for d in batch: for d in batch:
# # import pdb;pdb.set_trace()
# tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
# mapping = tokenizer.rematch(d[0], tokens)
# start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
# end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
# token_ids = tokenizer.tokens_to_ids(tokens)
# labels = np.zeros(len(token_ids))
# for start, end, label in d[1:]:
# if start in start_mapping and end in end_mapping:
# start = start_mapping[start]
# end = end_mapping[end]
# labels[start] = categories_label2id['B-'+label]
# labels[start + 1:end + 1] = categories_label2id['I-'+label]
# batch_token_ids.append(token_ids)
# batch_labels.append(labels)
# batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
# batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
tokens = tokenizer.tokenize(d[0], maxlen=maxlen) # 截断到 maxlen tokens = tokenizer.tokenize(d[0], maxlen=maxlen) # 截断到 maxlen
mapping = tokenizer.rematch(d[0], tokens) mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
...@@ -130,10 +114,10 @@ def collate_fn(batch): ...@@ -130,10 +114,10 @@ def collate_fn(batch):
batch_labels.append(labels) batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen, value=tokenizer._token_pad_id), batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen, value=tokenizer._token_pad_id),
dtype=torch.long, dtype=torch.long,
device=device) device="cuda:0")
batch_labels = torch.tensor(sequence_padding(batch_labels, length=maxlen, value=-100), batch_labels = torch.tensor(sequence_padding(batch_labels, length=maxlen, value=-100),
dtype=torch.long, dtype=torch.long,
device=device) device="cuda:0")
return batch_token_ids, batch_labels return batch_token_ids, batch_labels
# 转换数据集 # 转换数据集
...@@ -163,8 +147,6 @@ def pad_data_bin(data, output, bs, seq=256, len_catagory=7): ...@@ -163,8 +147,6 @@ def pad_data_bin(data, output, bs, seq=256, len_catagory=7):
return pad_data(data, seq) return pad_data(data, seq)
#crf = CRF(len(categories)).to(device)
#crf = CRF(len(categories))
def evaluate(data): def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10 X, Y, Z = 1e-10, 1e-10, 1e-10
...@@ -203,7 +185,7 @@ def evaluate(data): ...@@ -203,7 +185,7 @@ def evaluate(data):
preds_dcu = model.run(modelData) preds_dcu = model.run(modelData)
end += time.time() - start end += time.time() - start
infer_times.append(time.time() - start) infer_times.append(time.time() - start)
# print("******************:", 64/infer_times[-1]) print(f"****infer time: {infer_times[-1]} s***** fps: {64/infer_times[-1]}*********")
total_infer_times.append(time.time() - total_start) total_infer_times.append(time.time() - total_start)
result_1 = np.array(migraphx.from_gpu(preds_dcu[0])) result_1 = np.array(migraphx.from_gpu(preds_dcu[0]))
result_2 = np.array(migraphx.from_gpu(preds_dcu[1])) result_2 = np.array(migraphx.from_gpu(preds_dcu[1]))
...@@ -256,7 +238,6 @@ def evaluate(data): ...@@ -256,7 +238,6 @@ def evaluate(data):
data_idx += 1 data_idx += 1
total_start = time.time() total_start = time.time()
print("total_sample_data:", (64 * data_idx)) print("total_sample_data:", (64 * data_idx))
#avg_infer_time = sum(infer_times[1:]) / len(infer_times[1:])
avg_infer_fps = 64 * len(infer_times) / sum(infer_times) avg_infer_fps = 64 * len(infer_times) / sum(infer_times)
print(f"total_infer_time: {end}s") print(f"total_infer_time: {end}s")
print(f'avg_infer_fps: {avg_infer_fps}samples/s') print(f'avg_infer_fps: {avg_infer_fps}samples/s')
...@@ -270,28 +251,6 @@ def evaluate(data): ...@@ -270,28 +251,6 @@ def evaluate(data):
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2 f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2 return f1, precision, recall, f2, precision2, recall2
# def trans_entity2tuple(scores):
# '''把tensor转为(样本id, start, end, 实体类型)的tuple用于计算指标
# '''
# batch_entity_ids = set()
# for i, one_samp in enumerate(scores):
# entity_ids = []
# for j, item in enumerate(one_samp):
# flag_tag = categories_id2label[item.item()]
# if flag_tag.startswith('B-'): # B
# entity_ids.append([i, j, j, flag_tag[2:]])
# elif len(entity_ids) == 0:
# continue
# elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
# entity_ids[-1][-2] = j
# elif len(entity_ids[-1]) > 0:
# entity_ids.append([])
# for i in entity_ids:
# if i:
# batch_entity_ids.add(tuple(i))
# return batch_entity_ids
def trans_entity2tuple(scores): def trans_entity2tuple(scores):
'''把tensor转为(样本id, start, end, 实体类型)的tuple用于计算指标''' '''把tensor转为(样本id, start, end, 实体类型)的tuple用于计算指标'''
...@@ -343,7 +302,8 @@ def build_model(config_path, checkpoint_path): ...@@ -343,7 +302,8 @@ def build_model(config_path, checkpoint_path):
return model return model
if __name__ == '__main__': if __name__ == '__main__':
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "./best_model.pt")
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "/models/best_model.pt")
crf = ptmodel.crf crf = ptmodel.crf
# time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt' # time_fw为存储时间日志的文件对象,文件绝对路径为'log/time/time.txt'
......
...@@ -30,7 +30,7 @@ categories_label2id = {k: i for i, k in enumerate(categories)} ...@@ -30,7 +30,7 @@ categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base # BERT base
config_path = '/datasets/bert-base-chinese/config.json' config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = "/home/sunzhq/workspace/yidong/bert/bert4torch_cmcc/examples/sequence_labeling/best_model.pt" checkpoint_path = "/models/best_model.pt"
dict_path = '/datasets/bert-base-chinese/vocab.txt' dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu' device = 'cuda' if torch.cuda.is_available() else 'cpu'
#local_rank = int(os.environ['LOCAL_RANK']) #local_rank = int(os.environ['LOCAL_RANK'])
...@@ -131,7 +131,7 @@ optimizer = optim.Adam(model.parameters(), lr=6e-5) ...@@ -131,7 +131,7 @@ optimizer = optim.Adam(model.parameters(), lr=6e-5)
model.compile( model.compile(
loss=Loss(), loss=Loss(),
optimizer=optimizer, optimizer=optimizer,
use_apex=True, #此处设置是否采用apex_amp的混合精度 # use_apex=True, #此处设置是否采用apex_amp的混合精度
) )
#------------------------------------------------------------ #------------------------------------------------------------
...@@ -176,7 +176,7 @@ if __name__ == '__main__': ...@@ -176,7 +176,7 @@ if __name__ == '__main__':
# time_fw写入程序开始执行的时间 # time_fw写入程序开始执行的时间
time_fw.write('Start Time: {:.6f}\n'.format(time.time())) time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
model.load_weights("best_model.pt") model.load_weights("/models/best_model.pt")
evaluate(valid_dataloader) evaluate(valid_dataloader)
...@@ -125,3 +125,38 @@ Start Time: 1768874181.911044 ...@@ -125,3 +125,38 @@ Start Time: 1768874181.911044
End Time: 1768874210.087952 End Time: 1768874210.087952
Start Time: 1768874181.908558 Start Time: 1768874181.908558
End Time: 1768874210.517354 End Time: 1768874210.517354
Start Time: 1768958356.017406
End Time: 1768958383.966635
Start Time: 1768959668.076344
End Time: 1768959695.458553
Start Time: 1768966453.254532
Start Time: 1768966837.469851
Start Time: 1768966890.201351
Start Time: 1768966964.168061
Start Time: 1768967210.695314
End Time: 1768967238.448165
Start Time: 1768967388.564694
End Time: 1768967417.126865
Start Time: 1768967463.287595
Start Time: 1768973606.038938
Start Time: 1768973646.659818
Start Time: 1768973846.640323
Start Time: 1768974140.134768
Start Time: 1768974561.282243
Start Time: 1768975120.477424
Start Time: 1768975207.924210
Start Time: 1768975279.667178
Start Time: 1768975518.253238
Start Time: 1768975874.877119
Start Time: 1768975995.112462
Start Time: 1768976069.282189
Start Time: 1768976100.216421
Start Time: 1768976128.923981
Start Time: 1768976237.096360
Start Time: 1768976314.320467
Start Time: 1768976357.442332
Start Time: 1768976437.747921
Start Time: 1768976772.689363
Start Time: 1768983198.628691
Start Time: 1768983225.806162
Start Time: 1768983269.424441
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
nohup numactl -N 0 -m 0 python3 src/migraphx_infer.py 2>&1 | tee result_0.log &
export HIP_VISIBLE_DEVICES=1
nohup numactl -N 1 -m 1 python3 src/migraphx_infer.py 2>&1 | tee result_1.log &
export HIP_VISIBLE_DEVICES=2
nohup numactl -N 2 -m 2 python3 src/migraphx_infer.py 2>&1 | tee result_2.log &
export HIP_VISIBLE_DEVICES=3
nohup numactl -N 3 -m 3 python3 src/migraphx_infer.py 2>&1 | tee result_3.log &
# python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
# --image_size 160 \ # --image_size 160 \
# --distance_metric 1 \ # --distance_metric 1 \
...@@ -8,37 +17,37 @@ export HIP_VISIBLE_DEVICES=0 ...@@ -8,37 +17,37 @@ export HIP_VISIBLE_DEVICES=0
# --lfw_batch_size 64 # --lfw_batch_size 64
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_0.log & # --lfw_batch_size 64 2>&1 | tee result_0.log &
export HIP_VISIBLE_DEVICES=1 # export HIP_VISIBLE_DEVICES=1
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_1.log & # --lfw_batch_size 64 2>&1 | tee result_1.log &
export HIP_VISIBLE_DEVICES=2 # export HIP_VISIBLE_DEVICES=2
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_2.log & # --lfw_batch_size 64 2>&1 | tee result_2.log &
export HIP_VISIBLE_DEVICES=3 # export HIP_VISIBLE_DEVICES=3
nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
--image_size 160 \ # --image_size 160 \
--distance_metric 1 \ # --distance_metric 1 \
--use_flipped_images \ # --use_flipped_images \
--subtract_mean \ # --subtract_mean \
--use_fixed_image_standardization \ # --use_fixed_image_standardization \
--lfw_batch_size 64 2>&1 | tee result_3.log & # --lfw_batch_size 64 2>&1 | tee result_3.log &
# export HIP_VISIBLE_DEVICES=4 # export HIP_VISIBLE_DEVICES=4
# nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \ # nohup python3 src/validate_on_lfw.py /datasets/lfw_mtcnnpy_160 models_m/$1 \
# --image_size 160 \ # --image_size 160 \
......
...@@ -123,6 +123,7 @@ RANDOM_FLIP = 4 ...@@ -123,6 +123,7 @@ RANDOM_FLIP = 4
FIXED_STANDARDIZATION = 8 FIXED_STANDARDIZATION = 8
FLIP = 16 FLIP = 16
def create_input_pipeline(input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder): def create_input_pipeline(input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder):
import pdb;pdb.set_trace()
with tf.name_scope("tempscope"): with tf.name_scope("tempscope"):
images_and_labels_list = [] images_and_labels_list = []
for _ in range(nrof_preprocess_threads): for _ in range(nrof_preprocess_threads):
......
import os
import numpy as np
from PIL import Image # 推荐用于读取和调整图像
import onnxruntime as ort
import argparse
import lfw # 假设你有这个模块来读取 pairs 和 paths
import sys
# Import metrics and interpolation functions for AUC/EER calculation
from sklearn import metrics
from scipy.optimize import brentq
from scipy import interpolate
import time
from tqdm import tqdm
import migraphx
def AllocateOutputMemory(model):
    """Pre-allocate one GPU buffer for every model output.

    Returns a dict mapping each output name to a migraphx GPU argument
    allocated with that output's shape, ready to be passed to ``model.run``.
    """
    outputs = model.get_outputs()
    return {name: migraphx.allocate_gpu(s=shape) for name, shape in outputs.items()}
def preprocess_image(image_path, target_size=(160, 160)):
    """Load an image file and prepare it for FaceNet inference.

    Args:
        image_path: Path to the image file on disk.
        target_size: (width, height) the image is resized to (bilinear).

    Returns:
        An HxWx3 float32 numpy array normalized to roughly [-1, 1)
        via the FaceNet convention (x - 127.5) / 128.0.
    """
    # Use a context manager so the underlying file handle is closed;
    # the original left it open (PIL loads lazily), leaking descriptors
    # over the thousands of LFW images this script processes.
    with Image.open(image_path) as raw:
        rgb = raw.convert('RGB')  # guarantee 3 RGB channels
        rgb = rgb.resize(target_size, Image.Resampling.BILINEAR)
        img_np = np.array(rgb, dtype=np.float32)
    img_np = (img_np - 127.5) / 128.0
    return img_np
def load_lfw_for_onnx(lfw_dir, pairs_file, batch_size, image_size=(160, 160)):
    """Yield preprocessed LFW batches for ONNX/MIGraphX inference.

    Reads the LFW pairs file, preprocesses every image of every pair in
    order (NCHW float32), and yields ``(batch_array, labels, issame_part)``
    tuples of at most ``batch_size`` images; the last batch may be smaller.
    """
    pairs = lfw.read_pairs(os.path.expanduser(pairs_file))
    paths, actual_issame = lfw.get_paths(os.path.expanduser(lfw_dir), pairs)

    images, labels, issame_flags = [], [], []
    for pair_idx in range(len(actual_issame)):
        same = actual_issame[pair_idx]
        # Each pair contributes two consecutive paths.
        for img_path in (paths[2 * pair_idx], paths[2 * pair_idx + 1]):
            # Person identity is encoded as the parent directory name.
            person = os.path.basename(os.path.dirname(img_path))
            hwc = preprocess_image(img_path, target_size=image_size)
            # HWC -> CHW, contiguous, as the model input layout expects.
            chw = np.ascontiguousarray(np.transpose(hwc, (2, 0, 1)))
            images.append(chw)
            labels.append(person)
            issame_flags.append(same)

    total = len(images)
    num_batches, remainder = divmod(total, batch_size)
    if remainder != 0:
        print(f"Warning: Number of images ({total}) is not evenly divisible by batch size ({batch_size}). Last batch will be smaller.")
        num_batches += 1

    for b in range(num_batches):
        lo = b * batch_size
        hi = min(lo + batch_size, total)
        batch_array = np.stack(images[lo:hi], axis=0).astype(np.float32)
        yield batch_array, labels[lo:hi], issame_flags[lo:hi]
def main_onnx(args):
    """Run FaceNet embedding inference on LFW with a MIGraphX model and report accuracy/timing.

    Pipeline: stream preprocessed LFW batches, pad the last batch up to the
    model's fixed batch size of 64, run GPU inference, collect embeddings,
    then evaluate with the LFW protocol (accuracy, AUC, EER) and print
    throughput statistics.

    Args:
        args: parsed CLI namespace with lfw_dir, lfw_pairs, lfw_batch_size,
              model_path and image_size attributes.
    """
    embedding_size = 512  # FaceNet output dimension — assumed to match the model; TODO confirm
    pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
    paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
    nrof_pairs = len(actual_issame)
    nrof_images = nrof_pairs * 2
    print(f"Number of pairs: {nrof_pairs}, Number of images: {nrof_images}, Embedding size: {embedding_size}")

    data_generator = load_lfw_for_onnx(
        args.lfw_dir,
        args.lfw_pairs,
        args.lfw_batch_size,
        image_size=(args.image_size, args.image_size),
    )
    # Batch count only drives the tqdm progress bar.
    total_images_calculated = len(paths)
    num_batches_calculated = total_images_calculated // args.lfw_batch_size
    if total_images_calculated % args.lfw_batch_size != 0:
        num_batches_calculated += 1

    all_embeddings = np.zeros((nrof_images, embedding_size), dtype=np.float32)
    current_image_index = 0

    model = migraphx.load(args.model_path)
    inputName = list(model.get_inputs().keys())[0]
    modelData = AllocateOutputMemory(model)
    # Warm-up run so the first timed batch does not pay one-time setup costs.
    modelData[inputName] = migraphx.to_gpu(migraphx.argument(np.ones([64, 3, 160, 160]).astype(np.float32)))
    model.run(modelData)

    infer_times = []        # pure inference time per batch
    total_infer_times = []  # data loading + inference time per batch
    total_start = time.time()
    for i, (batch_images, _, _) in enumerate(tqdm(data_generator, total=num_batches_calculated, desc="Processing Batches")):
        original_batch_size = batch_images.shape[0]
        if original_batch_size < 64:
            # Model has a fixed batch size of 64: pad the short last batch by
            # repeating its final image, then discard the padded embeddings.
            pad_size = 64 - original_batch_size
            padding_images = np.repeat(batch_images[-1:], pad_size, axis=0)
            batch_images = np.concatenate((batch_images, padding_images), axis=0)
        modelData[inputName] = migraphx.to_gpu(migraphx.argument(batch_images))
        start = time.time()
        embeddings_dcu = model.run(modelData)
        embeddings_1 = np.array(migraphx.from_gpu(embeddings_dcu[0]))
        infer_time_taken = time.time() - start
        infer_times.append(infer_time_taken)
        if original_batch_size == 64:
            embeddings = embeddings_1
        else:
            embeddings = embeddings_1[:original_batch_size]
        batch_size_current = embeddings.shape[0]
        all_embeddings[current_image_index:current_image_index + batch_size_current] = embeddings
        current_image_index += batch_size_current  # Move index forward
        if i % 10 == 9:
            print('.', end='')
            sys.stdout.flush()
        total_infer_times.append(time.time() - total_start)
        total_start = time.time()  # Reset timer for next batch's data loading + inference
    print("\nAll batches processed.")

    # Fixed: the original allocated a throwaway np.zeros buffer here and
    # immediately rebound the name to all_embeddings — dead allocation removed.
    final_embeddings = all_embeddings
    distance_metric = 1  # Euclidean
    subtract_mean = True
    nrof_folds = 10
    tpr, fpr, accuracy, val, val_std, far = lfw.evaluate(
        final_embeddings,
        actual_issame,
        nrof_folds=nrof_folds,
        distance_metric=distance_metric,
        subtract_mean=subtract_mean
    )
    print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy)))
    print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
    auc = metrics.auc(fpr, tpr)
    print('Area Under Curve (AUC): %1.3f' % auc)
    eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
    print('Equal Error Rate (EER): %1.3f' % eer)

    print("***************************")
    # NOTE(review): FPS uses 64 per batch, so padded images in the last batch
    # count toward throughput — consistent with the original behavior.
    infer_time = sum(infer_times)
    avg_infer_fps = 64 * len(infer_times) / sum(infer_times)
    print(f"total_infer_time: {infer_time}s")
    print(f'avg_infer_fps: {avg_infer_fps}samples/s')
    load_data_infer_time = sum(total_infer_times)
    load_data_avg_infer_fps = len(total_infer_times) * 64 / sum(total_infer_times)
    print(f'load_data_total_infer_time: {load_data_infer_time}s')
    print(f'load_data_avg_total_Infer_fps: {load_data_avg_infer_fps} samples/s')
    print("******************************")
def parse_arguments_onnx():
    """Parse command-line options for the MIGraphX LFW validation run."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        '--lfw_dir', type=str,
        default="/datasets/lfw_mtcnnpy_160",
        help='Path to the data directory containing aligned LFW face patches.')
    ap.add_argument(
        '--lfw_batch_size', type=int,
        help='Number of images to process in a batch in the LFW test set.',
        default=64)  # Changed default to common ONNX batch size
    ap.add_argument(
        '--model_path', type=str,
        default="/home/sunzhq/workspace/yidong-infer/facenet/facenet/tools/onnx-models/facenet_static_bs64.mxr",
        help='Path to the ONNX model file.')
    ap.add_argument(
        '--image_size', type=int,
        help='Image size (height, width) in pixels.',
        default=160)
    ap.add_argument(
        '--lfw_pairs', type=str,
        help='The file containing the pairs to use for validation.',
        default='data/pairs.txt')
    return ap.parse_args()
if __name__ == '__main__':
    # Script entry point: parse CLI options, then run the MIGraphX LFW evaluation.
    main_onnx(parse_arguments_onnx())
import os
import numpy as np
from PIL import Image
import argparse
import lfw
import sys
from sklearn import metrics
from scipy.optimize import brentq
from scipy import interpolate
import time
from tqdm import tqdm
import migraphx
def AllocateOutputMemory(model):
    """Allocate a GPU output buffer per model output.

    Builds and returns ``{output_name: migraphx GPU argument}`` so the
    same buffers can be reused across every ``model.run`` call.
    """
    shapes = model.get_outputs()
    return {name: migraphx.allocate_gpu(s=shapes[name]) for name in shapes.keys()}
def evaluate_embeddings_with_different_methods(embeddings, actual_issame, use_flipped_images, embedding_size):
    """Evaluate embedding vectors under different aggregation schemes.

    Args:
        embeddings: stacked embedding matrix; without flips one row per image
            (2 per pair), with flips the rows alternate original/flipped
            (4 per pair) — assumed ordering, TODO confirm against the caller.
        actual_issame: per-pair ground-truth same/different flags.
        use_flipped_images: whether flipped copies were interleaved.
        embedding_size: width of a single embedding vector.

    Returns:
        Dict mapping method name to {"accuracy", "std", "auc"}.
        NOTE(review): if use_flipped_images is True but the row count is not
        nrof_pairs * 4, no branch fires and an empty dict is returned silently.
    """
    results = {}
    nrof_pairs = len(actual_issame)
    # Method 0: plain evaluation (no flipped images).
    if not use_flipped_images:
        tpr, fpr, accuracy, val, val_std, far = lfw.evaluate(
            embeddings,
            actual_issame,
            nrof_folds=10,
            distance_metric=1,
            subtract_mean=True
        )
        results["original"] = {
            "accuracy": np.mean(accuracy),
            "std": np.std(accuracy),
            "auc": metrics.auc(fpr, tpr)
        }
    # Method 1: the original TensorFlow concatenation scheme.
    elif embeddings.shape[0] == nrof_pairs * 4:  # flipped images are present
        # Method 1A: concatenate original and flipped embeddings side by side,
        # producing one double-width vector per image (even rows = original,
        # odd rows = flipped — relies on the interleaved ordering above).
        final_embeddings = np.zeros((nrof_pairs * 2, embedding_size * 2))
        final_embeddings[:, :embedding_size] = embeddings[0::2]
        final_embeddings[:, embedding_size:] = embeddings[1::2]
        tpr, fpr, accuracy, val, val_std, far = lfw.evaluate(
            final_embeddings,
            actual_issame,
            nrof_folds=10,
            distance_metric=1,
            subtract_mean=True
        )
        results["original+flipped"] = {
            "accuracy": np.mean(accuracy),
            "std": np.std(accuracy),
            "auc": metrics.auc(fpr, tpr)
        }
    return results
def main_optimized(args):
    """Optimized main: run FaceNet inference on LFW via MIGraphX, with optional flipped images, and print evaluation + timing stats."""
    # Load the compiled MIGraphX model and pre-allocate its GPU output buffers.
    model = migraphx.load(args.migraphx_model_path)
    input_name = list(model.get_inputs().keys())[0]
    modelData=AllocateOutputMemory(model)
    embedding_size=512  # assumed FaceNet output width — TODO confirm against the model
    print("="*70)
    # Load the LFW pair list.
    pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
    paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
    nrof_pairs = len(actual_issame)
    # Build the flat image-path list with per-image flip flags; with flips
    # enabled the order is original/flipped interleaved per image.
    all_image_paths = []
    flip_flags = []
    print("\nPreparing image paths...")
    for i in tqdm(range(nrof_pairs), desc="Organizing pairs"):
        path0 = paths[i*2]
        path1 = paths[i*2+1]
        # First image of the pair
        all_image_paths.append(path0)
        flip_flags.append(False)
        if args.use_flipped_images:
            all_image_paths.append(path0)
            flip_flags.append(True)
        # Second image of the pair
        all_image_paths.append(path1)
        flip_flags.append(False)
        if args.use_flipped_images:
            all_image_paths.append(path1)
            flip_flags.append(True)
    nrof_images = len(all_image_paths)
    print(f"Total images to process: {nrof_images}")
    # Pre-allocate storage for every embedding.
    all_embeddings = np.zeros((nrof_images, embedding_size), dtype=np.float32)
    # Inference loop.
    print("\nRunning inference...")
    infer_times = []
    for start_idx in tqdm(range(0, nrof_images, args.lfw_batch_size), desc="Processing"):
        end_idx = min(start_idx + args.lfw_batch_size, nrof_images)
        batch_paths = all_image_paths[start_idx:end_idx]
        batch_flip_flags = flip_flags[start_idx:end_idx]
        # Preprocess the batch.
        batch_images = []
        for img_path, flip_flag in zip(batch_paths, batch_flip_flags):
            # Read and preprocess with PIL.
            img = Image.open(img_path).convert('RGB')
            img = img.resize((args.image_size, args.image_size), Image.Resampling.BILINEAR)
            img_np = np.array(img, dtype=np.float32)
            if flip_flag:
                img_np = np.fliplr(img_np)
            # FaceNet normalization.
            img_np = (img_np - 127.5) / 128.0
            # HWC -> CHW layout.
            img_np = np.transpose(img_np, (2, 0, 1))
            batch_images.append(img_np)
        batch_array = np.stack(batch_images, axis=0).astype(np.float32)
        # The compiled model has a fixed batch of 64: pad a short final batch
        # by repeating its last image; padded rows are dropped after inference.
        if batch_array.shape[0] < 64:
            pad_size = 64 - batch_array.shape[0]
            padding = np.repeat(batch_array[-1:], pad_size, axis=0)
            batch_for_infer = np.concatenate([batch_array, padding], axis=0)
        else:
            batch_for_infer = batch_array
        # Convert to a MIGraphX argument (must be contiguous).
        batch_for_infer = np.ascontiguousarray(batch_for_infer)
        # mgx_arg = migraphx.to_gpu(migraphx.argument(batch_for_infer))
        # model_data = {input_name: mgx_arg}
        modelData[input_name] = migraphx.to_gpu(migraphx.argument(batch_for_infer))
        # Run inference (timed).
        infer_start = time.time()
        output = model.run(modelData)
        infer_time = time.time() - infer_start
        infer_times.append(infer_time)
        # Extract the embedding vectors back to host memory.
        embeddings_np = np.array(migraphx.from_gpu(output[0]))
        if batch_array.shape[0] < 64:
            embeddings_np = embeddings_np[:batch_array.shape[0]]
        all_embeddings[start_idx:end_idx] = embeddings_np
    print("\n" + "="*70)
    print("EVALUATION RESULTS")
    print("="*70)
    # Evaluate under the available aggregation methods.
    results = evaluate_embeddings_with_different_methods(
        all_embeddings,
        actual_issame,
        args.use_flipped_images,
        embedding_size
    )
    # Print the comparison table.
    print("\nComparison of different methods:")
    print("-"*70)
    for method_name, result in results.items():
        print(f"{method_name:20} | Accuracy: {result['accuracy']:.5f} ± {result['std']:.5f} | AUC: {result['auc']:.3f}")
    # Performance statistics (pure inference time only).
    if infer_times:
        total_infer_time = sum(infer_times)
        avg_fps = nrof_images / total_infer_time
        print("\n" + "="*70)
        print("PERFORMANCE STATISTICS")
        print("-"*70)
        print(f"Total inference time: {total_infer_time:.3f}s")
        print(f"Average FPS: {avg_fps:.1f} images/s")
        print(f"Number of images: {nrof_images}")
        if args.use_flipped_images:
            print(f" (Note: {nrof_pairs * 2} original images + their flips)")
if __name__ == '__main__':
    # CLI entry point: build the argument parser inline and hand the parsed
    # namespace to main_optimized.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lfw_dir', type=str, default="/datasets/lfw_mtcnnpy_160")
    parser.add_argument('--lfw_batch_size', type=int, default=64)
    parser.add_argument('--migraphx_model_path', type=str,
                        default="/home/sunzhq/workspace/yidong-infer/facenet/facenet/tools/onnx-models/facenet_static_bs64_fp32.mxr")
    parser.add_argument('--image_size', type=int, default=160)
    parser.add_argument('--lfw_pairs', type=str, default='data/pairs.txt')
    parser.add_argument('--use_flipped_images', action='store_true')
    # NOTE(review): accepted but never read inside main_optimized — kept for
    # CLI compatibility with the original validate_on_lfw flags.
    parser.add_argument('--use_fixed_image_standardization', action='store_true')
    args = parser.parse_args()
    main_optimized(args)
\ No newline at end of file
import onnx

# Small inspection script: load an ONNX model, validate it, and print its
# input/output names, element types and shapes.

# Load the ONNX model
model_path = "/home/sunzhq/workspace/yidong-infer/facenet/facenet/tools/onnx-models/facenet_static_bs64.onnx"  # replace with your actual .onnx file path
model = onnx.load(model_path)
# Check that the model is structurally valid
onnx.checker.check_model(model)
# Grab the graph
graph = model.graph
print("--- Model Info ---")
print(f"Model Name: {model.producer_name or 'Unknown'}")
print(f"ONNX Version: {model.ir_version}")
print("\n--- Input Information ---")
for input_tensor in graph.input:
    print(f"Name: {input_tensor.name}")
    print(f"Type: {input_tensor.type.tensor_type.elem_type}")
    # Resolve shape: symbolic dim_param when present, otherwise the fixed dim_value
    shape_dim = [dim.dim_param if dim.dim_param else dim.dim_value for dim in input_tensor.type.tensor_type.shape.dim]
    print(f"Shape: {shape_dim}")
    print("-" * 20)
print("\n--- Output Information ---")
for output_tensor in graph.output:
    print(f"Name: {output_tensor.name}")
    print(f"Type: {output_tensor.type.tensor_type.elem_type}")
    # Resolve shape
    shape_dim = [dim.dim_param if dim.dim_param else dim.dim_value for dim in output_tensor.type.tensor_type.shape.dim]
    print(f"Shape: {shape_dim}")
    print("-" * 20)
# Optional: overview of all nodes
# print("\n--- Node Overview ---")
# for i, node in enumerate(graph.node[:5]):  # print only the first 5 nodes as a sample
#     print(f"Node {i}: {node.op_type} -> {node.output[0]} (inputs: {node.input})")
# if len(graph.node) > 5:
#     print(f"... and {len(graph.node) - 5} more nodes")
\ No newline at end of file
# Compile the static-batch FaceNet ONNX model into a serialized MIGraphX
# binary (.mxr) with FP16 quantization; the input is pinned to NCHW 64x3x160x160.
migraphx-driver compile ./onnx-models/facenet_static_bs64.onnx \
--binary --fp16 \
--output ./onnx-models/facenet_static_bs64.mxr \
--input-dim @input:0 64 3 160 160
# --input-dim @phase_train:0 0
\ No newline at end of file
import tensorflow as tf
from google.protobuf import text_format
from tensorflow.core.framework import graph_pb2
from tensorflow.python.framework import graph_util
import numpy as np
def fix_phase_train_and_save(input_pb_path, output_pb_path_fixed):
    """Replace the 'phase_train' placeholder in a frozen TF graph with a scalar Const(False) and save the result.

    This lets the graph be exported (e.g. via tf2onnx) without supplying the
    boolean training flag at inference time.

    Args:
        input_pb_path: path to the source frozen GraphDef (.pb).
        output_pb_path_fixed: path where the rewritten GraphDef is written.
    """
    # Parse the frozen GraphDef from disk.
    with tf.io.gfile.GFile(input_pb_path, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    # Rebuild the graph node-by-node, swapping out the placeholder.
    new_graph_def = tf.compat.v1.GraphDef()
    for node in graph_def.node:
        if node.name == 'phase_train' or (node.name == 'phase_train:0' and node.op == 'Placeholder'):
            print(f"Found phase_train node: Name='{node.name}', Op='{node.op}', Dtype={node.attr['dtype'].type}")
            print(f" - Note: Actual output tensor name is likely '{node.name}:0'")
            # Local imports: proto helpers only needed on this branch.
            from tensorflow.core.framework import node_def_pb2, attr_value_pb2, tensor_pb2, types_pb2
            from tensorflow.core.framework import tensor_shape_pb2
            from tensorflow.python.framework import tensor_util
            # Build a Const node that keeps the original name so every
            # consumer of 'phase_train' rewires transparently.
            const_node = node_def_pb2.NodeDef()
            const_node.op = "Const"
            const_node.name = node.name # Use the same name ('phase_train')
            if node.device:
                const_node.device = node.device
            const_node.attr["dtype"].CopyFrom(node.attr["dtype"]) # Should be DT_BOOL (types_pb2.DT_BOOL)
            # Scalar boolean False tensor (inference mode).
            false_tensor = tensor_pb2.TensorProto(
                dtype=types_pb2.DT_BOOL,
                bool_val=[False],
                # tensor_shape=scalar_tensor_shape
            )
            # Clearing tensor_shape leaves an (empty) scalar shape.
            false_tensor.ClearField('tensor_shape')
            const_node.attr["value"].CopyFrom(attr_value_pb2.AttrValue(tensor=false_tensor))
            new_graph_def.node.extend([const_node])
            print(f"Replaced '{node.name}' with a scalar Const node having value False.")
        else:
            new_graph_def.node.extend([node])
    # NOTE(review): set but unused — presumably intended for a pruning step
    # such as extract_sub_graph; confirm before deleting.
    output_node_name_without_port = 'embeddings'
    # Best effort: strip training-only nodes; the graph is still usable if this fails.
    try:
        new_graph_def = graph_util.remove_training_nodes(
            input_graph_def=new_graph_def,
            protected_nodes=[]
        )
        print(f"Applied remove_training_nodes optimization.")
    except Exception as e:
        print(f"Warning: Could not apply remove_training_nodes: {e}. Proceeding with current graph_def.")
    # Save the modified graph
    with tf.io.gfile.GFile(output_pb_path_fixed, 'wb') as f:
        f.write(new_graph_def.SerializeToString())
    print(f"Modified .pb saved to: {output_pb_path_fixed}")
# Driver: rewrite the frozen FaceNet graph in place on disk, then print
# the follow-up tf2onnx command the user should run on the fixed .pb.
input_pb = "/home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900.pb"
fixed_pb = "/home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900_fixed_scalar.pb"
fix_phase_train_and_save(input_pb, fixed_pb)

hint_lines = [
    "\nNow run tf2onnx on the fixed .pb file (with scalar phase_train):",
    "python -m tf2onnx.convert \\",
    f" --input {fixed_pb} \\",
    " --inputs \"input:0[64,160,160,3]\" \\",
    " --outputs embeddings:0 \\",  # phase_train input is no longer needed
    " --output ./onnx-models/facenet_static_bs64.onnx \\",
    " --opset 11",
]
for hint in hint_lines:
    print(hint)
\ No newline at end of file
# Convert the fixed frozen graph (phase_train baked in as a Const False)
# to a static-batch-64 ONNX model.  --inputs-as-nchw transposes the NHWC
# TensorFlow input layout to the NCHW layout expected downstream.
python -m tf2onnx.convert \
--input /home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900_fixed_scalar.pb \
--output ./onnx-models/facenet_static_bs64.onnx \
--inputs input:0[64,160,160,3] \
--outputs embeddings:0 \
--opset 15 \
--inputs-as-nchw input:0
# Earlier attempt against the unfixed .pb (kept for reference):
# python -m tf2onnx.convert \
# --input /home/sunzhq/workspace/yidong-infer/facenet/facenet/models_m/facenet-tmp/20180408-102900.pb \
# --inputs "input:0[64,160,160,3]" \
# --inputs "phase_train:0False" \
# --outputs embeddings:0 \
# --output ./onnx-models/facenet_static_bs64.onnx \
# --opset 11
# python -m tf2onnx.convert --input frozen_model.pb --output model.onnx --inputs image_paths:0 --outputs embeddings:0
\ No newline at end of file
...@@ -3,18 +3,18 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1 ...@@ -3,18 +3,18 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
# /datasets/cifar100/cifar-100-python/test # /datasets/cifar100/cifar-100-python/test
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
# numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset /datasets/cifar100/test --batch_size 24 --fp16 True
nohup numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_0.log & # nohup numactl -N 0 -m 0 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_0.log &
export HIP_VISIBLE_DEVICES=1 # export HIP_VISIBLE_DEVICES=1
nohup numactl -N 1 -m 1 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_1.log & # nohup numactl -N 1 -m 1 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_1.log &
export HIP_VISIBLE_DEVICES=2 # export HIP_VISIBLE_DEVICES=2
nohup numactl -N 2 -m 2 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_2.log & # nohup numactl -N 2 -m 2 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_2.log &
export HIP_VISIBLE_DEVICES=3 # export HIP_VISIBLE_DEVICES=3
nohup numactl -N 3 -m 3 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_3.log & # nohup numactl -N 3 -m 3 python resnet50_migraphx.py --model ./resnet50.onnx --dataset ./datasets/cifar100/test --batch_size 24 --fp16 True 2>&1 | tee result_3.log &
# export HIP_VISIBLE_DEVICES=4 # export HIP_VISIBLE_DEVICES=4
# nohup python resnet50_migraphx.py --model /workspace/mmpretrain-main/resnet50.onnx --dataset /datasets/cifar100/ --batch_size 24 --fp16 True 2>&1 | tee result_4.log & # nohup python resnet50_migraphx.py --model /workspace/mmpretrain-main/resnet50.onnx --dataset /datasets/cifar100/ --batch_size 24 --fp16 True 2>&1 | tee result_4.log &
......
import argparse import argparse
import argparse
import cv2 import cv2
import numpy as np import numpy as np
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment