sunzhq2 / bytemlperf-dcu · Commits

Commit 24b257f1, authored Nov 19, 2024 by sunzhq2

init

parent 920b3c0f

Changes 330

Showing 20 changed files with 2634 additions and 0 deletions (+2634, -0)
ByteMLPerf/byte_micro_perf/backends/module_store.py          +1035  -0
ByteMLPerf/byte_micro_perf/backends/utils.py                  +132   -0
ByteMLPerf/byte_micro_perf/core/perf_engine.py                +556   -0
ByteMLPerf/byte_micro_perf/launch.py                          +244   -0
ByteMLPerf/byte_micro_perf/requirements.txt                   +14    -0
ByteMLPerf/byte_micro_perf/run.sh                             +4     -0
ByteMLPerf/byte_micro_perf/scripts/convert.py                 +386   -0
ByteMLPerf/byte_micro_perf/workloads/add.json                 +21    -0
ByteMLPerf/byte_micro_perf/workloads/allgather.json           +22    -0
ByteMLPerf/byte_micro_perf/workloads/allreduce.json           +22    -0
ByteMLPerf/byte_micro_perf/workloads/alltoall.json            +26    -0
ByteMLPerf/byte_micro_perf/workloads/batch_gemm.json          +26    -0
ByteMLPerf/byte_micro_perf/workloads/broadcast.json           +22    -0
ByteMLPerf/byte_micro_perf/workloads/cast.json                +17    -0
ByteMLPerf/byte_micro_perf/workloads/cos.json                 +17    -0
ByteMLPerf/byte_micro_perf/workloads/device2host.json         +17    -0
ByteMLPerf/byte_micro_perf/workloads/div.json                 +22    -0
ByteMLPerf/byte_micro_perf/workloads/exp.json                 +17    -0
ByteMLPerf/byte_micro_perf/workloads/exponential.json         +17    -0
ByteMLPerf/byte_micro_perf/workloads/gather.json              +17    -0
ByteMLPerf/byte_micro_perf/backends/module_store.py  0 → 100644
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random

import torch
import torch.distributed as dist


def gemm_compute_size(input_shapes, torch_dtype):
    # input_shapes: [[M, K], [K, N]]
    a_shape, b_shape = input_shapes
    M, _ = a_shape
    _, N = b_shape
    d_shape = [M, N]

    # get element_size and dtype_size
    input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
    output_element_num = sum([math.prod(shape) for shape in [d_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    if torch_dtype == torch.int8:
        output_tensor_size = 4 * output_element_num
    else:
        output_tensor_size = dtype_size * output_element_num
    batch_size = M
    tensor_size = input_tensor_size + output_tensor_size
    return (batch_size, tensor_size, input_tensor_size, output_tensor_size)


def gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
    # input_shapes: [[M, K], [K, N]]
    a_shape, b_shape = input_shapes
    M, _ = a_shape
    _, N = b_shape
    d_shape = [M, N]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    d_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor, d_tensor]


def batch_gemm_compute_size(input_shapes, torch_dtype):
    # input_shapes: [[bs, M, K], [bs, K, N]]
    a_shape, b_shape = input_shapes
    bs, M, _ = a_shape
    bs, _, N = b_shape
    d_shape = [bs, M, N]

    # get element_size and dtype_size
    input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
    output_element_num = sum([math.prod(shape) for shape in [d_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    if torch_dtype == torch.int8:
        output_tensor_size = 4 * output_element_num
    else:
        output_tensor_size = dtype_size * output_element_num
    batch_size = bs
    tensor_size = input_tensor_size + output_tensor_size
    return (batch_size, tensor_size, input_tensor_size, output_tensor_size)


def batch_gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
    # input_shapes: [[bs, M, K], [bs, K, N]]
    a_shape, b_shape = input_shapes
    bs, M, _ = a_shape
    bs, _, N = b_shape
    d_shape = [bs, M, N]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    d_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor, d_tensor]


def group_gemm_compute_size(input_shapes, torch_dtype):
    """
    [
        [[M1, K1], [K1, N1]],
        [[M2, K2], [K2, N2]]
    ]
    """
    input_tensor_size = 0
    output_tensor_size = 0
    for problem_shape in input_shapes:
        a_shape, b_shape = problem_shape
        M, _ = a_shape
        _, N = b_shape
        d_shape = [M, N]

        # get element_size and dtype_size
        input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
        output_element_num = sum([math.prod(shape) for shape in [d_shape]])
        dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
        input_tensor_size += dtype_size * input_element_num
        if torch_dtype == torch.int8:
            output_tensor_size += 4 * output_element_num
        else:
            output_tensor_size += dtype_size * output_element_num
    batch_size = 1
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def group_gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
    """
    [
        [[M1, K1], [K1, N1]],
        [[M2, K2], [K2, N2]]
    ]
    """
    left_tensors = []
    right_tensors = []
    output_tensors = []

    for problem_shape in input_shapes:
        a_shape, b_shape = problem_shape
        M, _ = a_shape
        _, N = b_shape
        d_shape = [M, N]

        # create input tensors
        left_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
        right_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

        # create output tensors
        output_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)

        left_tensors.append(left_tensor)
        right_tensors.append(right_tensor)
        output_tensors.append(output_tensor)

    return [left_tensors, right_tensors, output_tensors]


def sin_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num

    batch_size = c_shape[0]
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def sin_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def cast_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])

    if torch_dtype == torch.float32:
        dst_torch_dtype = torch.bfloat16
    elif torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
        dst_torch_dtype = torch.float32
    elif torch_dtype == torch.int8:
        dst_torch_dtype = torch.int32
    else:
        dst_torch_dtype = torch_dtype

    src_dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    dst_dtype_size = torch.tensor([], dtype=dst_torch_dtype).element_size()
    input_tensor_size = src_dtype_size * input_element_num
    output_tensor_size = dst_dtype_size * output_element_num

    batch_size = c_shape[0]
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def cast_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    c_shape = a_shape

    if torch_dtype == torch.float32:
        dst_torch_dtype = torch.bfloat16
    elif torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
        dst_torch_dtype = torch.float32
    elif torch_dtype == torch.int8:
        dst_torch_dtype = torch.int32
    else:
        dst_torch_dtype = torch_dtype

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=dst_torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def swiglu_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    input_tensor_shape = [batch_size, hidden_size]
    output_tensor_shape = [batch_size, hidden_size]

    input_element_num = sum([math.prod(shape) for shape in [input_tensor_shape]])
    output_element_num = sum([math.prod(shape) for shape in [output_tensor_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def swiglu_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    input_tensor_shape = [batch_size, hidden_size]
    output_tensor_shape = [batch_size, hidden_size]

    # create input tensors
    input_tensor = torch.randint(0, 7, input_tensor_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    output_tensor = torch.randint(0, 7, output_tensor_shape, dtype=torch_dtype, device=xpu_device)

    return [input_tensor, output_tensor]


def add_compute_size(input_shapes, torch_dtype):
    a_shape, b_shape = input_shapes
    c_shape = a_shape
    batch_size, hidden_size = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def add_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, b_shape = input_shapes
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor, c_tensor]


def layer_norm_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape
    w_shape = a_shape[-1:]

    input_element_num = sum([math.prod(shape) for shape in [a_shape, w_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def layer_norm_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape
    w_shape = a_shape[-1:]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    # create weight tensors
    w_tensor = torch.randint(0, 7, w_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor, w_tensor]


def softmax_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def softmax_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def reduce_sum_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = [batch_size, 1]

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def reduce_sum_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = [batch_size, 1]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def reduce_min_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    values_shape = [batch_size, 1]
    indices_shape = [batch_size, 1]

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    values_element_num = sum([math.prod(shape) for shape in [values_shape]])
    indices_element_num = sum([math.prod(shape) for shape in [indices_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    indices_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * values_element_num + indices_dtype_size * indices_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def reduce_min_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    values_shape = [batch_size, 1]
    indices_shape = [batch_size, 1]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    values_tensor = torch.randint(0, 7, values_shape, dtype=torch_dtype, device=xpu_device)
    indices_tensor = torch.randint(0, 7, indices_shape, dtype=torch.int64, device=xpu_device)

    return [a_tensor, values_tensor, indices_tensor]


def index_add_compute_size(input_shapes, torch_dtype):
    # src_tensor -->(index_tensor) dst_tensor
    dst_shape, src_shape = input_shapes

    src_batch_size = src_shape[0]
    dst_batch_size = dst_shape[0]
    index_shape = [src_batch_size]

    src_element_num = sum([math.prod(shape) for shape in [src_shape]])
    index_element_num = sum([math.prod(shape) for shape in [index_shape]])

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    index_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    src_tensor_size = dtype_size * src_element_num
    index_tensor_size = index_dtype_size * index_element_num

    input_tensor_size = 2 * src_tensor_size + index_tensor_size
    output_tensor_size = src_tensor_size
    tensor_size = input_tensor_size + output_tensor_size

    return src_batch_size, tensor_size, input_tensor_size, output_tensor_size


def index_add_create_tensors(input_shapes, torch_dtype, xpu_device):
    # src_tensor -->(index_tensor) dst_tensor
    dst_shape, src_shape = input_shapes

    src_batch_size = src_shape[0]
    dst_batch_size = dst_shape[0]
    index_shape = [src_batch_size]

    # create output tensors
    dst_tensor = torch.randint(0, 7, dst_shape, dtype=torch_dtype, device=xpu_device)

    # create input tensors
    src_tensor = torch.randint(0, 7, src_shape, dtype=torch_dtype, device=xpu_device)
    index_tensor = torch.randint(0, dst_batch_size, index_shape, dtype=torch.int64, device=xpu_device)

    return [dst_tensor, src_tensor, index_tensor]


def sort_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    indice_element_num = output_element_num

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    indice_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num + indice_dtype_size * indice_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def sort_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
    indice_tensor = torch.randint(0, 7, c_shape, dtype=torch.int64, device=xpu_device)

    return [a_tensor, c_tensor, indice_tensor]


def unique_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    indice_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num + indice_dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def unique_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    # create input tensors
    torch.manual_seed(1)
    a_tensor = torch.randint(0, 1024, a_shape, dtype=torch_dtype, device="cpu").to(device=xpu_device)

    # create output tensors
    c_tensor = torch.empty(c_shape, dtype=torch_dtype, device=xpu_device)
    count_tensor = torch.empty(c_shape, dtype=torch.int64, device=xpu_device)

    return [a_tensor, c_tensor, count_tensor]


def scatter_compute_size(input_shapes, torch_dtype):
    tensor_shape = input_shapes[0]
    batch_size, hidden_size = tensor_shape
    index_shape = [batch_size]

    input_element_num = sum([math.prod(shape) for shape in [tensor_shape]])
    output_element_num = sum([math.prod(shape) for shape in [tensor_shape]])
    index_element_num = sum([math.prod(shape) for shape in [index_shape]])

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    index_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    input_element_num = dtype_size * input_element_num + index_dtype_size * index_element_num
    output_element_num = dtype_size * output_element_num
    tensor_size = input_element_num + output_element_num
    return batch_size, tensor_size, input_element_num, output_element_num


def scatter_create_tensors(input_shapes, torch_dtype, xpu_device):
    tensor_shape = input_shapes[0]
    batch_size, hidden_size = tensor_shape
    index_shape = [batch_size]

    # create output tensors
    dst_tensor = torch.randint(0, 7, tensor_shape, dtype=torch_dtype, device=xpu_device)

    # create input tensors
    src_tensor = torch.randint(0, 7, tensor_shape, dtype=torch_dtype, device=xpu_device)

    index = [i for i in range(batch_size)]
    random.shuffle(index)
    index_tensor = torch.tensor(index, dtype=torch.int64, device=xpu_device)
    index_tensor = index_tensor.reshape(-1, 1).expand(-1, hidden_size)

    return [dst_tensor, src_tensor, index_tensor]


def host2device_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    output_element_num = sum([math.prod(shape) for shape in [a_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    output_tensor_size = dtype_size * output_element_num
    tensor_size = output_tensor_size
    return batch_size, tensor_size, 0, output_tensor_size


def host2device_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    host_tensor = torch.empty(a_shape, dtype=torch_dtype, device="cpu").pin_memory()
    device_tensor = torch.empty(a_shape, dtype=torch_dtype, device=xpu_device)

    return [host_tensor, device_tensor]


def allreduce_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    a_tensor = torch.zeros(a_shape, dtype=torch_dtype, device=xpu_device)
    return [a_tensor]


def allgather_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    output_element_num = sum([math.prod(shape) for shape in [a_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    output_tensor_size = dtype_size * output_element_num
    tensor_size = output_tensor_size
    return batch_size, tensor_size, 0, output_tensor_size


def allgather_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    world_size = dist.get_world_size()

    tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
    tensors = list(torch.chunk(tensor, world_size, dim=0))

    return [tensors]


def alltoall_compute_size(input_shapes, torch_dtype):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    world_size = dist.get_world_size()

    output_element_num = sum([math.prod(shape) for shape in [a_shape]]) * 2
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    output_tensor_size = dtype_size * output_element_num
    tensor_size = output_tensor_size
    return batch_size, tensor_size, 0, output_tensor_size


def alltoall_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    world_size = dist.get_world_size()

    input_tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
    input_tensors = list(torch.chunk(input_tensor, world_size, dim=0))

    output_tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
    output_tensors = list(torch.chunk(output_tensor, world_size, dim=0))

    return [input_tensors, output_tensors]


def p2p_compute_size(input_shapes, torch_dtype):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [b_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def p2p_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    a_tensor = torch.empty(a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.empty(b_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor]


"""
gemm ops
"""
class GemmOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
        compute_dtype = input_tensor_a.dtype
        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
            torch.mm(input_tensor_a, input_tensor_b, out=input_tensor_d)
        else:
            raise Exception(f"GemmOp with dtype {compute_dtype} is not implemented")


class BatchGemmOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
        compute_dtype = input_tensor_a.dtype
        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
            torch.bmm(input_tensor_a, input_tensor_b, out=input_tensor_d)
        else:
            raise Exception(f"BatchGemmOp with dtype {compute_dtype} is not implemented")


class GroupGemmOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
        compute_dtype = input_tensor_a[0].dtype
        for a, b, d in zip(input_tensor_a, input_tensor_b, input_tensor_d):
            if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
                torch.mm(a, b, out=d)
            else:
                raise Exception(f"GroupGemmOp with dtype {compute_dtype} is not implemented")


"""
unary ops
"""
class SinOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.sin(input_tensor, out=output_tensor)


class CosOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.cos(input_tensor, out=output_tensor)


class ExpOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.exp(input_tensor, out=output_tensor)


class ExponentialOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        input_tensor.exponential_()


class LogOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.log(input_tensor, out=output_tensor)


class SqrtOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.sqrt(input_tensor, out=output_tensor)


class CastOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = input_tensor.to(output_tensor.dtype)


class SiluOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = torch.nn.functional.silu(input_tensor)


class GeluOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = torch.nn.functional.gelu(input_tensor)


class SwiGLUOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.mul(torch.nn.functional.silu(input_tensor), input_tensor, out=output_tensor)


"""
Binary ops
"""
class AddOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.add(input_tensor_a, input_tensor_b, out=input_tensor_c)


class MulOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.mul(input_tensor_a, input_tensor_b, out=input_tensor_c)


class SubOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.sub(input_tensor_a, input_tensor_b, out=input_tensor_c)


class DivOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.div(input_tensor_a, input_tensor_b, out=input_tensor_c)


"""
reduction ops
"""
class LayerNormOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor, weight_tensor):
        output_tensor = torch.nn.functional.layer_norm(input_tensor, (input_tensor.shape[-1],), weight_tensor)


class SoftmaxOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = torch.nn.functional.softmax(input_tensor, dim=-1, dtype=output_tensor.dtype)


class ReduceSumOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.sum(input_tensor, dim=-1, keepdim=True, dtype=output_tensor.dtype, out=output_tensor)


class ReduceMinOp(torch.nn.Module):
    def forward(self, input_tensor, value_tensor, indice_tensor):
        torch.min(input_tensor, dim=-1, keepdim=True, out=(value_tensor, indice_tensor))


class ReduceMaxOp(torch.nn.Module):
    def forward(self, input_tensor, value_tensor, indice_tensor):
        torch.max(input_tensor, dim=-1, keepdim=True, out=(value_tensor, indice_tensor))


"""
index_ops
"""
class IndexAddOp(torch.nn.Module):
    def forward(self, dst_tensor, src_tensor, index_tensor):
        dst_tensor.index_add_(0, index_tensor, src_tensor)


class SortOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor, indice_tensor):
        torch.sort(input_tensor, dim=-1, out=(output_tensor, indice_tensor))


class UniqueOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor, count_tensor):
        output_tensor, count_tensor = torch.unique(input=input_tensor, sorted=False, return_counts=True, return_inverse=False)


class ScatterOp(torch.nn.Module):
    def forward(self, dst_tensor, src_tensor, index_tensor):
        dst_tensor.scatter_(0, index_tensor, src_tensor)


class GatherOp(torch.nn.Module):
    def forward(self, dst_tensor, src_tensor, index_tensor):
        torch.gather(src_tensor, 0, index_tensor, out=dst_tensor)


"""
h2d_ops
"""
class Host2DeviceOp(torch.nn.Module):
    def forward(self, host_tensor, device_tensor):
        device_tensor.copy_(host_tensor)


class Device2HostOp(torch.nn.Module):
    def forward(self, host_tensor, device_tensor):
        host_tensor.copy_(device_tensor)


"""
communication ops
"""
class AllReduceOp(torch.nn.Module):
    def forward(self, input_tensor):
        dist.all_reduce(input_tensor, op=dist.ReduceOp.SUM)


class AllGatherOp(torch.nn.Module):
    def forward(self, input_tensors):
        dist.all_gather(input_tensors, input_tensors[dist.get_rank()])


class ReduceScatterOp(torch.nn.Module):
    def forward(self, input_tensors):
        dist.reduce_scatter(input_tensors[dist.get_rank()], input_tensors)


class AllToAllOp(torch.nn.Module):
    def forward(self, input_tensors, output_tensors):
        dist.all_to_all(output_tensors, input_tensors)


class BroadcastOp(torch.nn.Module):
    def forward(self, input_tensor):
        dist.broadcast(input_tensor, 0)


class P2POp(torch.nn.Module):
    def forward(self, send_tensor, recv_tensor):
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        reqs = []
        if rank != world_size - 1:
            reqs.append(dist.isend(send_tensor, (rank + 1) % world_size))
        if rank != 0:
            reqs.append(dist.irecv(recv_tensor, (rank - 1 + world_size) % world_size))
        for req in reqs:
            req.wait()


op_registry = {
    # gemm ops
    "gemm": GemmOp(),
    "gemv": GemmOp(),
    "batch_gemm": BatchGemmOp(),
    "group_gemm": GroupGemmOp(),

    # unary ops
    "sin": SinOp(),
    "cos": CosOp(),
    "exp": ExpOp(),
    "exponential": ExponentialOp(),
    "log": LogOp(),
    "sqrt": SqrtOp(),
    "cast": CastOp(),
    "silu": SiluOp(),
    "gelu": GeluOp(),
    "swiglu": SwiGLUOp(),

    # binary ops
    "add": AddOp(),
    "sub": SubOp(),
    "mul": MulOp(),
    "div": DivOp(),

    # reduction ops
    "layernorm": LayerNormOp(),
    "softmax": SoftmaxOp(),
    "reduce_sum": ReduceSumOp(),
    "reduce_max": ReduceMaxOp(),
    "reduce_min": ReduceMinOp(),

    # index_ops
    "index_add": IndexAddOp(),
    "sort": SortOp(),
    "unique": UniqueOp(),
    "scatter": ScatterOp(),
    "gather": GatherOp(),

    # h2d_ops
    "device2host": Device2HostOp(),
    "host2device": Host2DeviceOp(),

    # ccl ops
    "broadcast": BroadcastOp(),
    "allreduce": AllReduceOp(),
    "allgather": AllGatherOp(),
    "alltoall": AllToAllOp(),
    "reducescatter": ReduceScatterOp(),
    "p2p": P2POp(),
}


op_compute_size_funcs = {
    # gemm_ops
    "gemm": gemm_compute_size,
    "gemv": gemm_compute_size,
    "batch_gemm": batch_gemm_compute_size,
    "group_gemm": group_gemm_compute_size,

    # unary_ops
    "sin": sin_compute_size,
    "cos": sin_compute_size,
    "exp": sin_compute_size,
    "exponential": sin_compute_size,
    "log": sin_compute_size,
    "sqrt": sin_compute_size,
    "cast": cast_compute_size,
    "silu": sin_compute_size,
    "gelu": sin_compute_size,
    "swiglu": swiglu_compute_size,

    # binary_ops
    "add": add_compute_size,
    "mul": add_compute_size,
    "sub": add_compute_size,
    "div": add_compute_size,

    # reduction_ops
    "layernorm": layer_norm_compute_size,
    "softmax": softmax_compute_size,
    "reduce_sum": reduce_sum_compute_size,
    "reduce_min": reduce_min_compute_size,
    "reduce_max": reduce_min_compute_size,

    # index_ops
    "index_add": index_add_compute_size,
    "sort": sort_compute_size,
    "unique": unique_compute_size,
    "scatter": scatter_compute_size,
    "gather": scatter_compute_size,

    # h2d_ops
    "host2device": host2device_compute_size,
    "device2host": host2device_compute_size,

    # ccl_ops
    "broadcast": host2device_compute_size,
    "allreduce": host2device_compute_size,
    "allgather": allgather_compute_size,
    "alltoall": alltoall_compute_size,
    "reducescatter": allgather_compute_size,
    "p2p": p2p_compute_size,
}


op_create_tensors_funcs = {
    # gemm ops
    "gemm": gemm_create_tensors,
    "gemv": gemm_create_tensors,
    "batch_gemm": batch_gemm_create_tensors,
    "group_gemm": group_gemm_create_tensors,

    # unary ops
    "sin": sin_create_tensors,
    "cos": sin_create_tensors,
    "exp": sin_create_tensors,
    "exponential": sin_create_tensors,
    "log": sin_create_tensors,
    "sqrt": sin_create_tensors,
    "cast": cast_create_tensors,
    "silu": sin_create_tensors,
    "gelu": sin_create_tensors,
    "swiglu": swiglu_create_tensors,

    # binary ops
    "add": add_create_tensors,
    "mul": add_create_tensors,
    "sub": add_create_tensors,
    "div": add_create_tensors,

    # reduction ops
    "layernorm": layer_norm_create_tensors,
    "softmax": softmax_create_tensors,
    "reduce_sum": reduce_sum_create_tensors,
    "reduce_min": reduce_min_create_tensors,
    "reduce_max": reduce_min_create_tensors,

    # index ops
    "index_add": index_add_create_tensors,
    "sort": sort_create_tensors,
    "unique": unique_create_tensors,
    "scatter": scatter_create_tensors,
    "gather": scatter_create_tensors,

    # h2d_ops
    "host2device": host2device_create_tensors,
    "device2host": host2device_create_tensors,

    # ccl_ops
    "broadcast": allreduce_create_tensors,
    "allreduce": allreduce_create_tensors,
    "allgather": allgather_create_tensors,
    "alltoall": alltoall_create_tensors,
    "reducescatter": allgather_create_tensors,
    "p2p": p2p_create_tensors,
}
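Note: as a reading aid, the sketch below shows how these three registries are typically consumed together: look up the op module, build its tensors, and time the forward call. The run_once helper, the device string, the iteration count, and the timing loop are illustrative assumptions for this example, not part of the committed file, which only defines the registries.

# Illustrative sketch only: drive one op end-to-end using the registries above.
import time
import torch
from backends.module_store import (
    op_registry, op_compute_size_funcs, op_create_tensors_funcs
)

def run_once(op_name, input_shapes, torch_dtype, device="cuda"):
    op = op_registry[op_name]
    tensors = op_create_tensors_funcs[op_name](input_shapes, torch_dtype, device)
    _, tensor_size, _, _ = op_compute_size_funcs[op_name](input_shapes, torch_dtype)

    # time a handful of iterations; a real backend would also synchronize the device here
    start = time.perf_counter()
    for _ in range(10):
        op(*tensors)
    latency_us = (time.perf_counter() - start) / 10 * 1e6
    return tensor_size, latency_us

if __name__ == "__main__":
    # e.g. a [1024, 1024] x [1024, 1024] fp16 gemm
    size, lat = run_once("gemm", [[1024, 1024], [1024, 1024]], torch.float16)
    print(size, lat)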
ByteMLPerf/byte_micro_perf/backends/utils.py  0 → 100644
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List

import numpy as np
import torch

from backends import module_store


def dump_communication_ops_report(
    op_name: str,
    torch_dtype,
    input_shapes: List[List[int]],
    compute_size_func,
    group_size: int,
    bandwidth_limit: float,
    latency: float,
    error: str = ""
):
    # get dtype name and dtype_size
    dtype_name = str(torch_dtype).split(".")[-1]
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    element_num = math.prod(input_shapes[0])
    tensor_size = dtype_size * element_num

    mb = tensor_size / 1024 / 1024

    if error == "":
        algo_bw = tensor_size / latency / 1e3

        """
        allreduce:      2 * (group_size - 1) * (tensor_size / group_size)
        allgather:      1 * (group_size - 1) * (tensor_size / group_size)
        reducescatter:  1 * (group_size - 1) * (tensor_size / group_size)
        alltoall:       1 * (group_size - 1) * (tensor_size / group_size)
        broadcast:      tensor_size
        p2p:            tensor_size
        """
        if op_name in ["allgather", "reducescatter", "alltoall"]:
            bus_bw = algo_bw * (group_size - 1) / group_size
        elif op_name in ["allreduce"]:
            bus_bw = 2 * algo_bw * (group_size - 1) / group_size
        elif op_name in ["broadcast", "p2p", "device2host", "host2device"]:
            bus_bw = algo_bw

        bandwidth_utils = None
        if bandwidth_limit is not None:
            bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)

        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Memory Size(MB)": round(mb, 2),
            "Group": group_size,
            "Kernel bandwidth(GB/s)": round(algo_bw, 2),
            "Bus bandwidth(GB/s)": round(bus_bw, 2),
            "Bandwidth Utilization(%)": bandwidth_utils,
            "Avg latency(us)": round(latency, 2),
        }
    else:
        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Memory Size(MB)": round(mb, 2),
            "Group": group_size,
            "Kernel bandwidth(GB/s)": 0,
            "Bus bandwidth(GB/s)": 0,
            "Bandwidth Utilization(%)": None,
            "Avg latency(us)": 0,
            "Error": error,
        }
    return report


def dump_computation_ops_report(
    op_name: str,
    torch_dtype: str,
    input_shapes: List[List[int]],
    compute_size_func,
    bandwidth_limit: float,
    latency: float,
    error: str = ""
):
    # get dtype name and dtype_size
    dtype_name = str(torch_dtype).split(".")[-1]
    batch_size, tensor_size, input_tensor_size, output_tensor_size = compute_size_func(input_shapes, torch_dtype)

    if error == "":
        qps = round(1e6 / latency * batch_size, 2)
        algo_bw = tensor_size / latency / 1e3

        bandwidth_utils = None
        if bandwidth_limit is not None:
            bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)

        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Read IO Size(MB)": round(input_tensor_size / 1024 / 1024, 2),
            "Write IO Size(MB)": round(output_tensor_size / 1024 / 1024, 2),
            "Memory Size(MB)": round(tensor_size / 1024 / 1024, 2),
            "Kernel bandwidth(GB/s)": round(algo_bw, 2),
            "Bandwidth Utilization(%)": bandwidth_utils,
            "Avg latency(us)": round(latency, 2),
            "QPS": qps,
        }
    else:
        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Read IO Size(MB)": round(input_tensor_size / 1024 / 1024, 2),
            "Write IO Size(MB)": round(output_tensor_size / 1024 / 1024, 2),
            "Memory Size(MB)": round(tensor_size / 1024 / 1024, 2),
            "Kernel bandwidth(GB/s)": 0,
            "Bandwidth Utilization(%)": None,
            "Avg latency(us)": 0,
            "QPS": 0,
            "Error": error,
        }
    return report
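Note: a minimal hedged example of how dump_computation_ops_report turns a measured latency into the report dictionary above. The shapes, the latency value, and the bandwidth limit are invented numbers for illustration only; only the function signature comes from this file.

# Illustrative only: feed a hypothetical measurement into the report helper above.
import torch
from backends.utils import dump_computation_ops_report
from backends.module_store import op_compute_size_funcs

report = dump_computation_ops_report(
    op_name="gemm",
    torch_dtype=torch.float16,
    input_shapes=[[4096, 4096], [4096, 4096]],
    compute_size_func=op_compute_size_funcs["gemm"],
    bandwidth_limit=1600.0,   # GB/s, hypothetical device peak
    latency=250.0,            # us, hypothetical measurement
)
# "Kernel bandwidth(GB/s)" is tensor_size / latency / 1e3,
# i.e. total bytes moved divided by the measured time.
print(report)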
ByteMLPerf/byte_micro_perf/core/perf_engine.py  0 → 100644
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
json
import
time
import
datetime
import
signal
import
argparse
import
importlib
import
logging
import
subprocess
import
pathlib
import
traceback
import
random
from
typing
import
Any
,
Dict
,
List
import
itertools
from
collections
import
namedtuple
import
torch.distributed
import
torch.multiprocessing
as
mp
import
virtualenv
import
torch
# directory config
CUR_DIR
=
pathlib
.
Path
.
cwd
().
absolute
()
FILE_DIR
=
pathlib
.
Path
(
__file__
).
parent
.
absolute
()
BYTE_MLPERF_ROOT
=
FILE_DIR
.
parent
sys
.
path
.
insert
(
0
,
str
(
BYTE_MLPERF_ROOT
))
# logger config
logging
.
basicConfig
(
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
"PerfEngine"
)
def
get_args
():
parser
=
argparse
.
ArgumentParser
()
# hardware config
parser
.
add_argument
(
"--hardware_type"
,
default
=
"GPU"
,
help
=
"The backend going to be evaluted, refs to backends/"
,
)
parser
.
add_argument
(
"--vendor_path"
,
help
=
"The hardware configs need to be loaded, refs to vendor_zoo/NVIDIA/A100-PCIe.json"
,
)
# task config
parser
.
add_argument
(
"--task_dir"
,
default
=
str
(
BYTE_MLPERF_ROOT
.
joinpath
(
"workloads"
)),
help
=
"The direcotry of tasks going to be evaluted, e.g., set to workloads"
)
parser
.
add_argument
(
"--task"
,
default
=
"gemm"
,
help
=
"The task going to be evaluted, refs to workloads/"
,
)
# feature control
parser
.
add_argument
(
"--parallel"
,
type
=
int
,
default
=
1
,
help
=
"Run all tasks in parallel if available"
)
parser
.
add_argument
(
"--activate_venv"
,
action
=
"store_true"
,
help
=
"Enable virtual environment to run the task"
,
)
args
=
parser
.
parse_args
()
return
args
def
load_workload
(
task
:
str
,
task_dir
:
str
)
->
Dict
[
str
,
Any
]:
"""
Return a list of dictionary with model Configuration
Args: List[str]
Returns: List[dic]
"""
modules_dir
=
pathlib
.
Path
(
task_dir
).
absolute
()
# create empty workload json data
workload_dict
=
{}
for
file
in
modules_dir
.
iterdir
():
if
(
file
.
stem
.
startswith
(
'_'
)
or
file
.
stem
.
startswith
(
'.'
)
or
file
.
is_dir
()
or
file
.
suffix
!=
'.json'
or
file
.
stem
!=
task
):
continue
workload_dict
=
json
.
loads
(
file
.
read_text
())
if
not
workload_dict
:
logger
.
error
(
f
"could not find
{
task
}
.json in
{
modules_dir
}
."
)
exit
(
1
)
return
workload_dict
def
parse_workload
(
workload
):
shape_list
=
[]
if
"input_shape_groups"
in
workload
:
input_shape_groups
=
workload
[
"input_shape_groups"
]
if
isinstance
(
workload
[
"input_shape_groups"
],
list
)
else
[
workload
[
"input_shape_groups"
]]
for
input_shape_group
in
input_shape_groups
:
if
"inputs"
in
input_shape_group
:
input_shape_list
=
[]
for
input_shapes
in
input_shape_group
[
"inputs"
]:
input_shape_list
.
append
([
list
(
shape
)
for
shape
in
itertools
.
product
(
*
input_shapes
)])
if
len
(
input_shape_list
)
==
1
:
shape_list
.
extend
(
input_shape_list
[
0
])
else
:
shape_list
.
extend
([
list
(
input_shape
)
for
input_shape
in
zip
(
*
input_shape_list
)])
else
:
gemm_keys
=
[
"M"
,
"K"
,
"N"
,
"MN"
,
"MK"
,
"KN"
]
gemm_values
=
[
input_shape_group
.
get
(
k
,
[])
for
k
in
gemm_keys
]
if
any
(
gemm_values
):
m
,
k
,
n
,
mn
,
mk
,
kn
=
gemm_values
# batch gemm
if
"batch_size"
in
input_shape_group
:
bs
=
input_shape_group
.
get
(
"batch_size"
,
[])
if
m
and
n
and
k
:
for
p
in
itertools
.
product
(
bs
,
m
,
k
,
n
):
shape_list
.
append
([[
p
[
0
],
p
[
1
],
p
[
2
]],
[
p
[
0
],
p
[
2
],
p
[
3
]]])
if
mn
and
k
:
for
p
in
itertools
.
product
(
bs
,
mn
,
k
):
shape_list
.
append
([[
p
[
0
],
p
[
1
][
0
],
p
[
2
]],
[
p
[
0
],
p
[
2
],
p
[
1
][
1
]]])
if
mk
and
n
:
for
p
in
itertools
.
product
(
bs
,
mk
,
n
):
shape_list
.
append
([[
p
[
0
],
p
[
1
][
0
],
p
[
1
][
1
]],
[
p
[
0
],
p
[
1
][
1
],
p
[
2
]]])
if
m
and
kn
:
for
p
in
itertools
.
product
(
bs
,
m
,
kn
):
shape_list
.
append
([[
p
[
0
],
p
[
1
],
p
[
2
][
0
]],
[
p
[
0
],
p
[
2
][
0
],
p
[
2
][
1
]]])
# group gemm
elif
"gemm_group"
in
input_shape_group
:
groups
=
input_shape_group
.
get
(
"gemm_group"
,
[])
batches
=
input_shape_group
.
get
(
"batch"
,
[])
kn
=
input_shape_group
.
get
(
"KN"
,
[])
if
k
and
n
:
kn
.
append
([
list
(
shape
)
for
shape
in
itertools
.
product
(
k
,
n
)])
for
batch
in
batches
:
for
_kn
in
kn
:
group_input_shape_list
=
[]
for
group
in
groups
:
group_input_shape_list
.
append
([[
group
*
batch
,
_kn
[
0
]],
[
_kn
[
0
],
_kn
[
1
]]])
shape_list
.
append
(
group_input_shape_list
)
# gemm
else
:
if
m
and
n
and
k
:
for
p
in
itertools
.
product
(
m
,
k
,
n
):
shape_list
.
append
([[
p
[
0
],
p
[
1
]],
[
p
[
1
],
p
[
2
]]])
if
mn
and
k
:
for
p
in
itertools
.
product
(
mn
,
k
):
shape_list
.
append
([[
p
[
0
][
0
],
p
[
1
]],
[
p
[
1
],
p
[
0
][
1
]]])
if
mk
and
n
:
for
p
in
itertools
.
product
(
mk
,
n
):
shape_list
.
append
([[
p
[
0
][
0
],
p
[
0
][
1
]],
[
p
[
0
][
1
],
p
[
1
]]])
if
m
and
kn
:
for
p
in
itertools
.
product
(
m
,
kn
):
shape_list
.
append
([[
p
[
0
],
p
[
1
][
0
]],
[
p
[
1
][
0
],
p
[
1
][
1
]]])
return
shape_list
ConfigInstance
=
namedtuple
(
"ConfigInstance"
,
[
"dtype"
,
"tensor_shapes"
,
"index"
,
"total"
])
ResultItem
=
namedtuple
(
"ResultItem"
,
[
"config"
,
"report"
])
class
PerfEngine
:
def
__init__
(
self
)
->
None
:
super
().
__init__
()
self
.
args
=
get_args
()
self
.
workload
=
load_workload
(
self
.
args
.
task
,
self
.
args
.
task_dir
)
self
.
backend_type
=
self
.
args
.
hardware_type
self
.
old_os_path
=
os
.
environ
[
"PATH"
]
self
.
prev_sys_path
=
list
(
sys
.
path
)
self
.
real_prefix
=
sys
.
prefix
self
.
version
=
self
.
get_version
()
def
get_version
(
self
):
version
=
""
try
:
version_file
=
os
.
path
.
join
(
str
(
BYTE_MLPERF_ROOT
),
"../VERSION"
)
with
open
(
version_file
)
as
f
:
_version
=
f
.
read
().
splitlines
()
version
=
'.'
.
join
(
v
.
split
(
'='
)[
1
]
for
v
in
_version
)
except
Exception
as
e
:
traceback
.
print_exc
()
logger
.
warning
(
f
"get bytemlperf version failed, error msg:
{
e
}
"
)
return
version
def
get_cpu_name
(
self
):
command
=
"lscpu | grep 'Model name' | awk -F: '{print $2}'"
cpu_name
=
subprocess
.
check_output
(
command
,
shell
=
True
)
return
cpu_name
.
decode
().
strip
()
def
start_engine
(
self
)
->
None
:
if
self
.
args
.
activate_venv
:
self
.
activate_venv
(
self
.
backend_type
)
# init backend
hardware_type
=
self
.
backend_type
logger
.
info
(
"Loading Heterogeneous Backend: {}"
.
format
(
hardware_type
))
backend_module
=
importlib
.
import_module
(
"backends."
+
hardware_type
+
".backend_"
+
hardware_type
.
lower
())
self
.
backend_class
=
getattr
(
backend_module
,
"Backend"
+
hardware_type
)
self
.
backend
=
self
.
backend_class
(
self
.
workload
,
self
.
args
.
vendor_path
)
# create output dir based on task
# {BYTEMLPERF_ROOT}/byte_micro_perf/reports/{backend_type}/{task_name}
hardware_reports_dir
=
BYTE_MLPERF_ROOT
.
joinpath
(
"reports"
,
self
.
backend_type
)
output_dir
=
BYTE_MLPERF_ROOT
.
joinpath
(
"reports"
,
self
.
backend_type
,
self
.
workload
[
"operator"
]
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
# get input shape info
target_group_list
=
self
.
workload
.
get
(
"group"
,
[
1
])
target_group_list
.
sort
()
device_count
=
getattr
(
self
.
backend
,
"get_device_count"
)()
group_list
=
[]
for
group
in
target_group_list
:
if
group
<=
device_count
:
group_list
.
append
(
group
)
else
:
break
dtype_list
=
self
.
workload
.
get
(
"dtype"
,
[
"float32"
])
shape_list
=
parse_workload
(
self
.
workload
)
if
not
group_list
or
not
dtype_list
or
not
shape_list
:
logger
.
error
(
"empty group/dtype/shape"
)
exit
(
1
)
test_list
=
[]
case_index
=
0
for
dtype
in
dtype_list
:
for
shape
in
shape_list
:
test_list
.
append
(
ConfigInstance
(
dtype
,
shape
,
case_index
+
1
,
len
(
dtype_list
)
*
len
(
shape_list
)))
case_index
=
case_index
+
1
try
:
mp
.
set_start_method
(
"spawn"
,
force
=
True
)
except
Exception
as
e
:
traceback
.
print_exc
()
logger
.
error
(
f
"Set start method failed, error msg:
{
e
}
"
)
# terminate subprocesses
subprocess_pids
=
[]
def
signal_handler
(
signum
,
frame
):
logger
.
info
(
f
"Received signal
{
signum
}
, exiting..."
)
if
subprocess_pids
:
for
pid
in
subprocess_pids
:
logger
.
info
(
f
"terminate subprocess:
{
pid
}
"
)
os
.
kill
(
pid
,
signal
.
SIGTERM
)
sys
.
exit
(
0
)
signal
.
signal
(
signal
.
SIGINT
,
signal_handler
)
signal
.
signal
(
signal
.
SIGTERM
,
signal_handler
)
# all operations will enter subprocess to test in parallel
for
group
in
group_list
:
logger
.
info
(
f
"Start to test group size:
{
group
}
"
)
instance_num
=
min
(
device_count
,
max
(
1
,
self
.
args
.
parallel
))
if
group
==
1
else
group
if
self
.
workload
[
"operator"
]
in
[
"device2host"
,
"host2device"
]:
instance_num
=
1
input_queues
=
mp
.
Queue
()
output_queues
=
mp
.
Queue
(
maxsize
=
1
)
try
:
_subprocesses
=
mp
.
spawn
(
fn
=
self
.
perf_func
,
args
=
(
instance_num
,
group
,
output_dir
,
test_list
,
input_queues
,
output_queues
),
nprocs
=
instance_num
,
join
=
False
,
daemon
=
False
)
subprocess_pids
=
_subprocesses
.
pids
()
for
_
in
range
(
instance_num
):
assert
"ready"
==
output_queues
.
get
()
logger
.
info
(
"all ranks are ready and listening, init done"
)
start_time
=
time
.
perf_counter_ns
()
if
group
==
1
:
for
test_instance
in
test_list
:
input_queues
.
put
(
test_instance
,
False
)
for
_
in
range
(
instance_num
):
input_queues
.
put
(
None
,
False
)
result_list
=
[]
if
group
==
1
:
for
_
in
range
(
instance_num
):
result_list
.
extend
(
output_queues
.
get
())
elif
group
>
1
:
result_list
.
extend
(
output_queues
.
get
())
result_list
=
sorted
(
result_list
,
key
=
lambda
x
:
x
.
config
.
index
)
dtype_results_mapping
=
{}
for
result
in
result_list
:
if
result
.
config
.
dtype
not
in
dtype_results_mapping
:
dtype_results_mapping
[
result
.
config
.
dtype
]
=
[]
dtype_results_mapping
[
result
.
config
.
dtype
].
append
(
result
)
for
dtype
,
results
in
dtype_results_mapping
.
items
():
dtype_results_mapping
[
dtype
]
=
sorted
(
results
,
key
=
lambda
x
:
x
.
config
.
index
)
base_report
=
{
"Operator"
:
self
.
workload
[
"operator"
].
upper
(),
"Backend"
:
self
.
backend_type
,
"Host Info"
:
self
.
get_cpu_name
(),
"Device Info"
:
getattr
(
self
.
backend
,
"get_device_name"
)(),
"Version"
:
self
.
version
,
"Execution Date"
:
time
.
strftime
(
"%Y-%m-%d %H:%M:%S"
),
"Performance"
:
[
result
.
report
for
result
in
dtype_results_mapping
[
dtype
]]
}
filename
=
(
f
"result-
{
str
(
dtype
)
}
"
+
(
f
"-group
{
group
}
"
if
group
>
1
else
""
)
+
".json"
)
filepath
=
output_dir
.
joinpath
(
filename
)
with
open
(
filepath
,
"w"
)
as
f
:
json
.
dump
(
base_report
,
f
,
indent
=
4
)
for
process
in
_subprocesses
.
processes
:
process
.
join
()
end_time
=
time
.
perf_counter_ns
()
duration
=
(
end_time
-
start_time
)
/
1e9
duration
=
round
(
duration
,
3
)
current_time
=
datetime
.
datetime
.
now
().
strftime
(
"%Y-%m-%d %H:%M:%S"
)
ret_code
=
0
for
process
in
_subprocesses
.
processes
:
if
process
.
exitcode
!=
0
:
ret_code
=
process
.
exitcode
break
if
ret_code
!=
0
:
with
open
(
f
"
{
hardware_reports_dir
}
/_run_report.log"
,
"a"
)
as
f
:
print
(
f
"[failed]
{
self
.
args
.
task
}
, group_size=
{
group
}
,
{
current_time
}
,
{
duration
}
s"
,
file
=
f
)
else
:
with
open
(
f
"
{
hardware_reports_dir
}
/_run_report.log"
,
"a"
)
as
f
:
print
(
f
"[success]
{
self
.
args
.
task
}
, group_size=
{
group
}
,
{
current_time
}
,
{
duration
}
s"
,
file
=
f
)
except
Exception
as
e
:
traceback
.
print_exc()
                logger.error(f"Execute task: {self.args.task} failed, group: {group}, error msg: {e}")
                current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                with open(f"{hardware_reports_dir}/_run_report.log", "a") as f:
                    print(f"[error] {self.args.task}, group_size={group}, {current_time}", file=f)

            subprocess_pids = []
            time.sleep(1)

        if self.args.activate_venv:
            self.deactivate_venv()

    def perf_func(self, rank: int, *args):
        world_size, group_size, output_dir, test_list, input_queues, output_queues = args

        backend_instance = self.backend_class(self.workload, self.args.vendor_path)
        backend_instance.rank = rank
        backend_instance.world_size = world_size
        backend_instance.set_device(rank)

        if group_size > 1:
            backend_instance.initialize_ccl(rank, world_size)

        op_name = self.workload["operator"]
        backend_instance.get_op_instance()
        output_queues.put("ready")

        result_list = []
        if group_size == 1:
            while True:
                test_instance = input_queues.get()
                if test_instance is None:
                    break

                test_dtype = test_instance.dtype
                test_shape = test_instance.tensor_shapes
                """
                input_shape could be:
                    List[int]: single shape. cos
                    List[List[int]]: multiple inputs. add
                    List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
                """
                if isinstance(test_shape[0], int):
                    test_shape = [test_shape]

                try:
                    reports = backend_instance.perf(test_shape, test_dtype)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Execute op: {op_name.lower()} failed, input_shape: {test_shape}, dtype: {test_dtype}, error msg: {e}")
                    reports = {}

                if reports and "Error" not in reports:
                    result_list.append(ResultItem(test_instance, reports))

                    latency = reports.get("Avg latency(us)", 0)
                    kernel_bw = reports.get("Kernel bandwidth(GB/s)", 0)
                    bus_bw = reports.get("Bus bandwidth(GB/s)", 0)
                    print(f"rank {rank}, {test_instance}, latency: {latency}\nkernel_bw: {kernel_bw}, bus_bw: {bus_bw}")
                else:
                    print(f"rank {rank}, {test_instance}, error")

            output_queues.put(result_list)

        elif group_size > 1:
            for test_instance in test_list:
                test_dtype = test_instance.dtype
                test_shape = test_instance.tensor_shapes
                """
                input_shape could be:
                    List[int]: single shape. cos
                    List[List[int]]: multiple inputs. add
                    List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
                """
                if isinstance(test_shape[0], int):
                    test_shape = [test_shape]

                try:
                    reports = backend_instance.perf(test_shape, test_dtype)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Execute op: {op_name.lower()} failed, input_shape: {test_shape}, dtype: {test_dtype}, error msg: {e}")
                    reports = {}

                if reports and "Error" not in reports:
                    result_list.append(ResultItem(test_instance, reports))

                    latency = reports.get("Avg latency(us)", 0)
                    kernel_bw = reports.get("Kernel bandwidth(GB/s)", 0)
                    bus_bw = reports.get("Bus bandwidth(GB/s)", 0)
                    if rank == 0:
                        print(f"rank {rank}, {test_instance}, latency: {latency}\nkernel_bw: {kernel_bw}, bus_bw: {bus_bw}")
                else:
                    if rank == 0:
                        print(f"rank {rank}, {test_instance}, error")

            if rank == 0:
                output_queues.put(result_list)

        if group_size > 1:
            backend_instance.destroy_process_group()

    def activate_venv(self, hardware_type: str) -> bool:
        if os.path.exists("backends/" + hardware_type + "/requirements.txt"):
            logger.info("Activating Virtual Env for " + hardware_type)

            venv_dir = os.path.join("backends", hardware_type + "/venv")
            activate_file = os.path.join(venv_dir, "bin", "activate_this.py")
            if not os.path.exists(venv_dir):
                logger.info("venv not exist, Creating Virtual Env for " + hardware_type)

                virtualenv.create_environment(venv_dir, True)
                exec(open(activate_file).read(), {"__file__": activate_file})
                python_path = os.path.join(venv_dir, "bin", "python3")
                subprocess.call([python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"])
                subprocess.call([python_path, "-m", "pip", "install", "-r", "backends/" + hardware_type + "/requirements.txt", "-q"])
            else:
                exec(open(activate_file).read(), {"__file__": activate_file})
                """
                just in case install failed in pre-run.
                """
                python_path = os.path.join(venv_dir, "bin", "python3")
                subprocess.call([python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"])
                subprocess.call([python_path, "-m", "pip", "install", "-r", "backends/" + hardware_type + "/requirements.txt", "-q"])

            if not hasattr(sys, "real_prefix"):
                return False
            return True
        return True

    def deactivate_venv(self):
        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
        sys.prefix = self.real_prefix
        os.environ["PATH"] = self.old_os_path


if __name__ == "__main__":
    engine = PerfEngine()
    engine.start_engine()
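A detail of perf_func that is easy to miss above is the shape normalization: a bare List[int] (e.g. a cos workload) is wrapped once so that backend_instance.perf always receives a list of input shapes, while add-style and group_gemm-style shapes pass through unchanged. A minimal standalone sketch of that behavior, with illustrative shapes that are not taken from any workload file:

def normalize_shapes(tensor_shapes):
    # List[int] (single input, e.g. cos) becomes List[List[int]];
    # List[List[int]] (multiple inputs, e.g. add) and
    # List[List[List[int]]] (grouped problems, e.g. group_gemm) pass through unchanged.
    if isinstance(tensor_shapes[0], int):
        return [tensor_shapes]
    return tensor_shapes

print(normalize_shapes([1024, 8192]))                  # [[1024, 8192]]
print(normalize_shapes([[1024, 8192], [1024, 8192]]))  # unchanged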
ByteMLPerf/byte_micro_perf/launch.py
0 → 100644
View file @
24b257f1
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import pathlib
import logging
import subprocess
import signal

# directory config
CUR_DIR = pathlib.Path.cwd().absolute()
FILE_DIR = pathlib.Path(__file__).parent.absolute()
BYTE_MLPERF_ROOT = FILE_DIR
sys.path.insert(0, str(BYTE_MLPERF_ROOT))

# logger config
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("launch")


def parse_task(task_dir):
    tasks = []
    if os.path.isdir(task_dir):
        for root, _, files in os.walk(task_dir, topdown=False):
            for name in files:
                if name.endswith(".json"):
                    tasks.append(name.rsplit('.', 1)[0])
    return tasks


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # hardware config
    parser.add_argument(
        "--hardware_type",
        default="GPU",
        help="The backend going to be evaluated, refs to backends/",
    )
    parser.add_argument(
        "--vendor_path",
        help="The hardware configs need to be loaded, refs to vendor_zoo/",
    )

    # task config
    parser.add_argument(
        "--task_dir",
        default=str(BYTE_MLPERF_ROOT.joinpath("workloads").absolute()),
        help="The directory of tasks going to be evaluated, e.g., set to workloads"
    )
    parser.add_argument(
        "--task",
        default="all",
        help="The task going to be evaluated, refs to workloads/, defaults to all tasks in workloads/"
    )

    # list all supported task and hardware
    parser.add_argument(
        "--show_task_list",
        action="store_true",
        help="Print all available task names"
    )
    parser.add_argument(
        "--show_hardware_list",
        action="store_true",
        help="Print all hardware backends bytemlperf supports",
    )

    # feature control
    parser.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="Run all tasks in parallel if available"
    )
    parser.add_argument(
        "--install_requirements",
        action="store_true",
        help="Install all required packages"
    )
    parser.add_argument(
        "--activate_venv",
        action="store_true",
        help="Enable python virtual environment"
    )
    args = parser.parse_args()

    args.vendor_path = pathlib.Path(args.vendor_path).absolute() if args.vendor_path else None
    args.task_dir = pathlib.Path(args.task_dir).absolute()
    os.chdir(str(BYTE_MLPERF_ROOT))

    # show tasks
    task_list = [file.stem for file in args.task_dir.iterdir()]
    task_list.sort()
    task_mapping = {
        "all": task_list,
        "gemm_ops": [],
        "unary_ops": [],
        "binary_ops": [],
        "reduction_ops": [],
        "index_ops": [],
        "h2d_ops": [],
        "ccl_ops": []
    }
    for task in task_list:
        if task in ["gemm", "gemv", "batch_gemm", "group_gemm"]:
            task_mapping["gemm_ops"].append(task)
        if task in ["sin", "cos", "exp", "exponential", "log", "sqrt", "cast", "silu", "gelu", "swiglu"]:
            task_mapping["unary_ops"].append(task)
        if task in ["add", "mul", "sub", "div"]:
            task_mapping["binary_ops"].append(task)
        if task in ["layernorm", "softmax", "reduce_sum", "reduce_max", "reduce_min"]:
            task_mapping["reduction_ops"].append(task)
        if task in ["index_add", "sort", "unique", "gather", "scatter"]:
            task_mapping["index_ops"].append(task)
        if task in ["host2device", "device2host", "device2device"]:
            task_mapping["h2d_ops"].append(task)
        if task in ["allgather", "allreduce", "alltoall", "broadcast", "p2p", "reduce_scatter"]:
            task_mapping["ccl_ops"].append(task)

    if args.show_task_list:
        logger.info("******************* Supported Task *******************")
        print(task_list)
        exit(0)

    # show hardwares
    hardware_list = []
    for file in BYTE_MLPERF_ROOT.joinpath("backends").iterdir():
        if file.is_dir() and file.stem.startswith("_") is False:
            hardware_list.append(file.stem)
    if args.show_hardware_list:
        logger.info("***************** Supported Hardware Backend *****************")
        print(hardware_list)
        exit(0)

    # check task
    test_cases = []
    if args.task in task_mapping.keys():
        test_cases = task_mapping[args.task]
    else:
        specified_tasks = args.task.split(",")
        for task in specified_tasks:
            if task not in task_list:
                logger.error(f"Task {task} not found in {args.task_dir}")
                exit(1)
            test_cases.append(task)
    logger.info(f"******************* Tasks: *****************")
    logger.info(f"{test_cases}\n")

    # check hardware
    hardware = args.hardware_type
    if hardware not in hardware_list:
        logger.error(f"Hardware {hardware} not found in {BYTE_MLPERF_ROOT.joinpath('backends')}")
        exit(1)
    logger.info(f"******************* hardware: *****************")
    logger.info(f"{hardware}\n")

    if args.install_requirements:
        logger.info("******************* Pip Package Installing *******************")
        subprocess.run(["python3", "-m", "pip", "install", "pip", "--upgrade", "--quiet"])
        subprocess.run(["python3", "-m", "pip", "install", "-r", "requirements.txt", "--quiet"])
        if not args.activate_venv:
            subprocess.run(["python3", "-m", "pip", "install", "-r", f"backends/{hardware}/requirements.txt", "--quiet"])

    outputs_dir = pathlib.Path(BYTE_MLPERF_ROOT).joinpath("reports", args.hardware_type)
    if not outputs_dir.exists():
        outputs_dir.mkdir(parents=True)
    with open(f"{BYTE_MLPERF_ROOT}/reports/{args.hardware_type}/_run_report.log", "w") as file:
        pass

    # terminate task perf process
    subprocess_pid = -1

    def signal_handler(signum, frame):
        logger.info(f"Received signal {signum}, exiting...")
        if subprocess_pid != -1:
            logger.info(f"terminate subprocess: {subprocess_pid}")
            os.kill(subprocess_pid, signal.SIGTERM)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    failed_ops = []
    for task in test_cases:
        cmds = [
            "python3", "./core/perf_engine.py",
            "--hardware_type", args.hardware_type,
            "--vendor_path", str(args.vendor_path),
            "--task", task,
            "--task_dir", str(args.task_dir),
            "--parallel", str(args.parallel)
        ]
        if args.activate_venv:
            cmds.append("--activate_venv")

        print(f"******************************************* Start to test op: [{task}]. *******************************************")
        process = subprocess.Popen(cmds)
        subprocess_pid = process.pid
        ret = process.wait()
        if ret != 0:
            failed_ops.append(task)
        print("")

    if failed_ops:
        logger.error(f"Failed ops: {failed_ops}")
        exit(1)
    else:
        logger.info("All ops passed")
ByteMLPerf/byte_micro_perf/requirements.txt
0 → 100644
View file @
24b257f1
matplotlib
pandas
virtualenv==16.7.12
scikit-learn
prompt_toolkit
tqdm
opencv-python
transformers
tokenization
fpdf
attrs
decorator
typing-extensions
pydot
\ No newline at end of file
ByteMLPerf/byte_micro_perf/run.sh
0 → 100644
View file @
24b257f1
source /home/workspace/dtk-24.04.3/env.sh
python3 ./launch.py --parallel 8
ByteMLPerf/byte_micro_perf/scripts/convert.py
0 → 100644
View file @
24b257f1
import sys
import csv
import json
import pathlib
import argparse
import logging

CUR_DIR = pathlib.Path(__file__).parent.absolute()
PRJ_ROOT_DIR = CUR_DIR.parent
sys.path.insert(0, str(PRJ_ROOT_DIR))

unique_attrs = [
    "op_name",
    "sku_name",
    "owner",
    "perf_mode"
]

def get_unique_key(op_name, sku_name, owner, perf_mode, *args, **kwargs):
    return ".".join([sku_name, owner, op_name, perf_mode]).replace(" ", "_")

arguments_map = {
    # unary operators
    # [batch, len] --> [batch, len]
    "sin": ["dtype", "batch", "len"],
    "cos": ["dtype", "batch", "len"],
    "exp": ["dtype", "batch", "len"],
    "exponential": ["dtype", "batch", "len"],
    "silu": ["dtype", "batch", "len"],
    "gelu": ["dtype", "batch", "len"],
    "swiglu": ["dtype", "batch", "len"],

    # float32: float32 --> float16/bfloat16
    # float16: float16 --> float32
    # bfloat16: bfloat16 --> float32
    "cast": ["dtype", "batch", "len"],

    # binary operators
    # [batch, len] (op) [batch, len] --> [batch, len]
    "add": ["dtype", "batch", "len"],
    "mul": ["dtype", "batch", "len"],
    "sub": ["dtype", "batch", "len"],
    "div": ["dtype", "batch", "len"],

    # reduction operators
    # [batch, len] --> [batch, len]
    "layernorm": ["dtype", "batch", "len"],
    "softmax": ["dtype", "batch", "len"],
    # [batch, len] --> [batch, 1]
    "reduce_sum": ["dtype", "batch", "len"],
    "reduce_min": ["dtype", "batch", "len"],
    "reduce_max": ["dtype", "batch", "len"],

    # index operators
    # [batch, len] (op) [batch] --> [batch, len]
    "index_add": ["dtype", "batch", "len"],
    # [batch, len] --> [batch, len]
    "sort": ["dtype", "batch", "len"],
    "unique": ["dtype", "batch", "len"],
    "gather": ["dtype", "batch", "len"],
    "scatter": ["dtype", "batch", "len"],

    # matrix operators
    # [M, K] * [K, N] --> [M, N]
    "gemm": ["dtype", "M", "N", "K"],
    # [batch, M, K] * [batch, K, N] --> [batch, M, N]
    "batch_gemm": ["dtype", "batch", "M", "N", "K"],
    # group * { [M, K] * [K, N] --> [M, N] }
    "group_gemm": ["dtype", "batch", "group", "M_str", "N", "K"],

    # communication (collective) operators
    # [batch, len] --> [batch, len]
    # tp_size split over batch
    "broadcast": ["dtype", "tp_size", "batch", "len"],
    "allreduce": ["dtype", "tp_size", "batch", "len"],
    "allgather": ["dtype", "tp_size", "batch", "len"],
    "alltoall": ["dtype", "tp_size", "batch", "len"],
    "reducescatter": ["dtype", "tp_size", "batch", "len"],
    "p2p": ["dtype", "tp_size", "batch", "len"],

    "device2host": ["dtype", "batch", "len"],
    "host2device": ["dtype", "batch", "len"]
}

target_attrs = [
    # latency in us
    "latency"
]

def get_csv_headers(op_name):
    return unique_attrs + arguments_map.get(op_name, []) + target_attrs

logger = logging.getLogger("bytemlperf_aeolus")

def setup_logger(loglevel: str):
    fmt = logging.Formatter(
        fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s]: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(fmt)
    logger.addHandler(handler)
    logger.setLevel(loglevel.upper())
    logger.propagate = False

sku_name_mapping = {
    "MLU590-M9": "MLU590 M9",
    "MLU590-M9D": "MLU590 M9D",
    "MLU590-M9DK": "MLU590 M9D",
    "Iluvatar BI-V150": "BI-V150",
    "NVIDIA A800-SXM4-80GB": "A800 80GB SXM",
    "NVIDIA H800": "H800 80GB SXM",
    "NVIDIA H20": "H20 96GB SXM",
    "Ascend910B2C": "Ascend910B2"
}

dtype_map = {
    "float": "float32",
    "half": "float16",
    "int": "int32"
}

def normal_ops_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch = json_data["Tensor Shapes"][0][0]
    len = json_data["Tensor Shapes"][0][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch, len, latency]

def gemm_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    M = json_data["Tensor Shapes"][0][0]
    K = json_data["Tensor Shapes"][0][1]
    N = json_data["Tensor Shapes"][1][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, M, N, K, latency]

def batch_gemm_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch_size = json_data["Tensor Shapes"][0][0]
    M = json_data["Tensor Shapes"][0][1]
    K = json_data["Tensor Shapes"][0][2]
    N = json_data["Tensor Shapes"][1][2]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch_size, M, N, K, latency]

def group_gemm_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch_size = json_data["Tensor Shapes"][0][0][0]
    group = len(json_data["Tensor Shapes"])
    M_list = [int(json_data["Tensor Shapes"][i][0][0]) // batch_size for i in range(group)]
    M_list_str = "/".join([str(m) for m in M_list])
    K = json_data["Tensor Shapes"][0][0][1]
    N = json_data["Tensor Shapes"][0][1][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch_size, group, M_list_str, N, K, latency]

def ccl_ops_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    tp_size = json_data["Group"]
    batch = json_data["Tensor Shapes"][0][0]
    len = json_data["Tensor Shapes"][0][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, tp_size, batch, len, latency]

def d2h_h2d_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch = json_data["Tensor Shapes"][0][0]
    len = json_data["Tensor Shapes"][0][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch, len, latency]

post_func_map = {
    "sin": normal_ops_func,
    "cos": normal_ops_func,
    "exp": normal_ops_func,
    "exponential": normal_ops_func,
    "silu": normal_ops_func,
    "gelu": normal_ops_func,
    "swiglu": normal_ops_func,
    "cast": normal_ops_func,
    "add": normal_ops_func,
    "mul": normal_ops_func,
    "sub": normal_ops_func,
    "div": normal_ops_func,
    "layernorm": normal_ops_func,
    "softmax": normal_ops_func,
    "reduce_sum": normal_ops_func,
    "reduce_min": normal_ops_func,
    "reduce_max": normal_ops_func,
    "index_add": normal_ops_func,
    "sort": normal_ops_func,
    "unique": normal_ops_func,
    "gather": normal_ops_func,
    "scatter": normal_ops_func,
    "gemm": gemm_func,
    "batch_gemm": batch_gemm_func,
    "group_gemm": group_gemm_func,
    "broadcast": ccl_ops_func,
    "allreduce": ccl_ops_func,
    "allgather": ccl_ops_func,
    "alltoall": ccl_ops_func,
    "reducescatter": ccl_ops_func,
    "p2p": ccl_ops_func,
    "device2host": d2h_h2d_func,
    "host2device": d2h_h2d_func
}

def postprocess(op, file_list, dst_dir):
    json_data_list = [json.load(open(file)) for file in file_list]
    if not json_data_list:
        logger.error(f"no data found in {file_list}")
        return

    sku_name = json_data_list[0]["Device Info"]
    sku_name = sku_name_mapping.get(sku_name, sku_name)

    perf_datas = []
    for json_data in json_data_list:
        if "Performance" not in json_data:
            logger.error(f"no performance data")
            continue
        perf_data = json_data["Performance"]
        if not perf_datas:
            perf_datas = perf_data
        else:
            perf_datas.extend(perf_data)

    unique_name = get_unique_key(op, sku_name, "torch", "host")
    unique_csv_file = f"{unique_name}.csv"
    unique_csv_path = dst_dir / unique_csv_file
    with open(unique_csv_path, "w") as f:
        writer = csv.writer(f)
        writer.writerow(get_csv_headers(op))
        for perf_data in perf_datas:
            if op in post_func_map:
                row = post_func_map[op](op, sku_name, "torch", "host", perf_data)
                if row:
                    writer.writerow(row)

def convert_src(src, dst):
    logger.info(f"src: {src}")
    logger.info(f"dst: {dst}")

    op_data_map = {}
    for file in src.rglob("*.json"):
        dir_name = file.parent.name
        if dir_name == "gemv":
            dir_name = "gemm"
        if not dir_name in op_data_map:
            op_data_map[dir_name] = []
        op_data_map[dir_name].append(file)

    for op, files in op_data_map.items():
        logger.info(f"op: {op}")
        if op not in arguments_map and op != "gemv":
            logger.error(f"invalid op: {op}")
            continue
        postprocess(op, files, dst)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, required=True)
    parser.add_argument("--output_dir", type=str, default="./temp")
    parser.add_argument("--log_level", type=str, default="INFO")
    args = parser.parse_args()

    setup_logger(args.log_level)

    src_dir = pathlib.Path(args.src).absolute()
    if not src_dir.exists():
        logger.error(f"{args.src} does not exist")
        exit(1)
    elif not src_dir.is_dir():
        logger.error(f"{args.src} is not a directory")
        exit(1)

    output_dir = pathlib.Path(args.output_dir).absolute()
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)
    elif not output_dir.is_dir():
        logger.error(f"{args.output_dir} is not a directory")
        exit(1)

    convert_src(src_dir, output_dir)
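As a quick sanity check of the conversion helpers above, the key and header logic can be exercised directly. This is only a sketch; the SKU name is one of the entries in sku_name_mapping, and the shapes and latency value are made up for illustration:

headers = get_csv_headers("gemm")
# ['op_name', 'sku_name', 'owner', 'perf_mode', 'dtype', 'M', 'N', 'K', 'latency']

key = get_unique_key("gemm", "A800 80GB SXM", "torch", "host")
# 'A800_80GB_SXM.torch.gemm.host' -> rows land in A800_80GB_SXM.torch.gemm.host.csv

# a row produced by gemm_func for a hypothetical per-case report
sample_report = {
    "Dtype": "float16",
    "Tensor Shapes": [[1024, 4096], [4096, 8192]],
    "Avg latency(us)": 123.4,
}
row = gemm_func("gemm", "A800 80GB SXM", "torch", "host", sample_report)
# ['gemm', 'A800 80GB SXM', 'torch', 'host', 'float16', 1024, 8192, 4096, 123.4]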
ByteMLPerf/byte_micro_perf/workloads/add.json
0 → 100644
View file @
24b257f1
{
    "operator": "add",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ],
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
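The workload files that follow all share the input_shape_groups layout shown in add.json above. How the lists are expanded into concrete test shapes is decided by the perf engine's shape-parsing code, which is not part of this excerpt; the sketch below only illustrates the apparent intent, under the assumption that each inner list holds the candidate sizes for one tensor dimension and that test shapes are formed by taking one value per dimension:

import itertools

# assumed expansion of one "inputs" entry from add.json: [[4, 8, ..., 131072], [8192]]
dim_candidates = [[4, 8, 16, 32], [8192]]   # truncated for brevity
shapes = [list(shape) for shape in itertools.product(*dim_candidates)]
# [[4, 8192], [8, 8192], [16, 8192], [32, 8192]]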
ByteMLPerf/byte_micro_perf/workloads/allgather.json
0 → 100644
View file @
24b257f1
{
    "operator": "allgather",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/allreduce.json
0 → 100644
View file @
24b257f1
{
    "operator": "allreduce",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/alltoall.json
0 → 100644
View file @
24b257f1
{
    "operator": "alltoall",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ],
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/batch_gemm.json
0 → 100644
View file @
24b257f1
{
    "operator": "batch_gemm",
    "iterations": 100,
    "input_shape_groups": {
        "batch_size": [8, 12, 16, 20, 24, 28, 32],
        "M": [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
        "KN": [
            [1024, 1024], [4096, 4096], [8192, 8192],
            [16384, 32], [16384, 128], [16384, 1024],
            [32, 16384], [128, 16384], [1024, 16384]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16", "int8"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/broadcast.json
0 → 100644
View file @
24b257f1
{
    "operator": "broadcast",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/cast.json
0 → 100644
View file @
24b257f1
{
    "operator": "cast",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/cos.json
0 → 100644
View file @
24b257f1
{
    "operator": "cos",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/device2host.json
0 → 100644
View file @
24b257f1
{
    "operator": "device2host",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/div.json
0 → 100644
View file @
24b257f1
{
    "operator": "div",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ],
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/exp.json
0 → 100644
View file @
24b257f1
{
    "operator": "exp",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/exponential.json
0 → 100644
View file @
24b257f1
{
    "operator": "exponential",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/gather.json
0 → 100644
View file @
24b257f1
{
    "operator": "gather",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [1024],
                [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file