Commit 24b257f1 authored by sunzhq2

init

parent 920b3c0f
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random
import torch
import torch.distributed as dist
def gemm_compute_size(input_shapes, torch_dtype):
# input_shapes: [[M, K], [K, N]]
a_shape, b_shape = input_shapes
M, _ = a_shape
_, N = b_shape
d_shape = [M, N]
# get element_size and dtype_size
input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
output_element_num = sum([math.prod(shape) for shape in [d_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
if torch_dtype == torch.int8:
output_tensor_size = 4 * output_element_num
else:
output_tensor_size = dtype_size * output_element_num
batch_size = M
tensor_size = input_tensor_size + output_tensor_size
return (batch_size, tensor_size, input_tensor_size, output_tensor_size)
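# Illustrative example (hypothetical shapes, assuming float16):
#   gemm_compute_size([[1024, 4096], [4096, 8192]], torch.float16)
#   -> batch_size = 1024 (M)
#      input_tensor_size  = 2 * (1024*4096 + 4096*8192) bytes
#      output_tensor_size = 2 * (1024*8192) bytes
#      tensor_size        = input_tensor_size + output_tensor_size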
def gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
# input_shapes: [[M, K], [K, N]]
a_shape, b_shape = input_shapes
M, _ = a_shape
_, N = b_shape
d_shape = [M, N]
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
d_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, b_tensor, d_tensor]
def batch_gemm_compute_size(input_shapes, torch_dtype):
# input_shapes: [[bs, M, K], [bs, K, N]]
a_shape, b_shape = input_shapes
bs, M, _ = a_shape
bs, _, N = b_shape
d_shape = [bs, M, N]
# get element_size and dtype_size
input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
output_element_num = sum([math.prod(shape) for shape in [d_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
if torch_dtype == torch.int8:
output_tensor_size = 4 * output_element_num
else:
output_tensor_size = dtype_size * output_element_num
batch_size = bs
tensor_size = input_tensor_size + output_tensor_size
return (batch_size, tensor_size, input_tensor_size, output_tensor_size)
def batch_gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
# input_shapes: [[bs, M, K], [bs, K, N]]
a_shape, b_shape = input_shapes
bs, M, _ = a_shape
bs, _, N = b_shape
d_shape = [bs, M, N]
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
d_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, b_tensor, d_tensor]
def group_gemm_compute_size(input_shapes, torch_dtype):
"""
[
[[M1, K1], [K1, N1]],
[[M2, K2], [K2, N2]]
]
"""
input_tensor_size = 0
output_tensor_size = 0
for problem_shape in input_shapes:
a_shape, b_shape = problem_shape
M, _ = a_shape
_, N = b_shape
d_shape = [M, N]
# get element_size and dtype_size
input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
output_element_num = sum([math.prod(shape) for shape in [d_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size += dtype_size * input_element_num
if torch_dtype == torch.int8:
output_tensor_size += 4 * output_element_num
else:
output_tensor_size += dtype_size * output_element_num
batch_size = 1
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def group_gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
"""
[
[[M1, K1], [K1, N1]],
[[M2, K2], [K2, N2]]
]
"""
left_tensors = []
right_tensors = []
output_tensors = []
for problem_shape in input_shapes:
a_shape, b_shape = problem_shape
M, _ = a_shape
_, N = b_shape
d_shape = [M, N]
# create input tensors
left_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
right_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
output_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)
left_tensors.append(left_tensor)
right_tensors.append(right_tensor)
output_tensors.append(output_tensor)
return [left_tensors, right_tensors, output_tensors]
def sin_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
c_shape = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
batch_size = c_shape[0]
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def sin_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
c_shape = a_shape
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, c_tensor]
def cast_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
c_shape = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
if torch_dtype == torch.float32:
dst_torch_dtype = torch.bfloat16
elif torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
dst_torch_dtype = torch.float32
elif torch_dtype == torch.int8:
dst_torch_dtype = torch.int32
else:
dst_torch_dtype = torch_dtype
src_dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
dst_dtype_size = torch.tensor([], dtype=dst_torch_dtype).element_size()
input_tensor_size = src_dtype_size * input_element_num
output_tensor_size = dst_dtype_size * output_element_num
batch_size = c_shape[0]
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
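# Illustrative example (hypothetical shape): for a torch.float32 input the cast
# target is bfloat16, so cast_compute_size([[1024, 8192]], torch.float32)
# counts 4 bytes per input element and 2 bytes per output element.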
def cast_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
c_shape = a_shape
if torch_dtype == torch.float32:
dst_torch_dtype = torch.bfloat16
elif torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
dst_torch_dtype = torch.float32
elif torch_dtype == torch.int8:
dst_torch_dtype = torch.int32
else:
dst_torch_dtype = torch_dtype
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=dst_torch_dtype, device=xpu_device)
return [a_tensor, c_tensor]
def swiglu_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
input_tensor_shape = [batch_size, hidden_size]
output_tensor_shape = [batch_size, hidden_size]
input_element_num = sum([math.prod(shape) for shape in [input_tensor_shape]])
output_element_num = sum([math.prod(shape) for shape in [output_tensor_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def swiglu_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
input_tensor_shape = [batch_size, hidden_size]
output_tensor_shape = [batch_size, hidden_size]
# create input tensors
input_tensor = torch.randint(0, 7, input_tensor_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
output_tensor = torch.randint(0, 7, output_tensor_shape, dtype=torch_dtype, device=xpu_device)
return [input_tensor, output_tensor]
def add_compute_size(input_shapes, torch_dtype):
a_shape, b_shape = input_shapes
c_shape = a_shape
batch_size, hidden_size = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def add_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, b_shape = input_shapes
c_shape = a_shape
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, b_tensor, c_tensor]
def layer_norm_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
w_shape = a_shape[-1:]
input_element_num = sum([math.prod(shape) for shape in [a_shape, w_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def layer_norm_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
w_shape = a_shape[-1:]
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
# create weight tensors
w_tensor = torch.randint(0, 7, w_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, c_tensor, w_tensor]
def softmax_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def softmax_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, c_tensor]
def reduce_sum_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = [batch_size, 1]
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def reduce_sum_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = [batch_size, 1]
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, c_tensor]
def reduce_min_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
values_shape = [batch_size, 1]
indices_shape = [batch_size, 1]
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
values_element_num = sum([math.prod(shape) for shape in [values_shape]])
indices_element_num = sum([math.prod(shape) for shape in [indices_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
indices_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * values_element_num + indices_dtype_size * indices_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def reduce_min_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
values_shape = [batch_size, 1]
indices_shape = [batch_size, 1]
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
values_tensor = torch.randint(0, 7, values_shape, dtype=torch_dtype, device=xpu_device)
indices_tensor = torch.randint(0, 7, indices_shape, dtype=torch.int64, device=xpu_device)
return [a_tensor, values_tensor, indices_tensor]
def index_add_compute_size(input_shapes, torch_dtype):
# src_tensor -->(index_tensor) dst_tensor
dst_shape, src_shape = input_shapes
src_batch_size = src_shape[0]
dst_batch_size = dst_shape[0]
index_shape = [src_batch_size]
src_element_num = sum([math.prod(shape) for shape in [src_shape]])
index_element_num = sum([math.prod(shape) for shape in [index_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
index_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
src_tensor_size = dtype_size * src_element_num
index_tensor_size = index_dtype_size * index_element_num
input_tensor_size = 2 * src_tensor_size + index_tensor_size
output_tensor_size = src_tensor_size
tensor_size = input_tensor_size + output_tensor_size
return src_batch_size, tensor_size, input_tensor_size, output_tensor_size
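# Note on the sizing above: the input side counts the src tensor twice
# (presumably one read of src plus one read of the indexed dst rows, which
# total the same size as src) plus the int64 index tensor, while the output
# side counts one write of the indexed dst rows.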
def index_add_create_tensors(input_shapes, torch_dtype, xpu_device):
# src_tensor -->(index_tensor) dst_tensor
dst_shape, src_shape = input_shapes
src_batch_size = src_shape[0]
dst_batch_size = dst_shape[0]
index_shape = [src_batch_size]
# create output tensors
dst_tensor = torch.randint(0, 7, dst_shape, dtype=torch_dtype, device=xpu_device)
# create input tensors
src_tensor = torch.randint(0, 7, src_shape, dtype=torch_dtype, device=xpu_device)
index_tensor = torch.randint(0, dst_batch_size, index_shape, dtype=torch.int64, device=xpu_device)
return [dst_tensor, src_tensor, index_tensor]
def sort_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
indice_element_num = output_element_num
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
indice_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num + indice_dtype_size * indice_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def sort_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
# create input tensors
a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
# create output tensors
c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
indice_tensor = torch.randint(0, 7, c_shape, dtype=torch.int64, device=xpu_device)
return [a_tensor, c_tensor, indice_tensor]
def unique_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [c_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
indice_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num + indice_dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def unique_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
c_shape = a_shape
# create input tensors
torch.manual_seed(1)
a_tensor = torch.randint(0, 1024, a_shape, dtype=torch_dtype, device="cpu").to(device=xpu_device)
# create output tensors
c_tensor = torch.empty(c_shape, dtype=torch_dtype, device=xpu_device)
count_tensor = torch.empty(c_shape, dtype=torch.int64, device=xpu_device)
return [a_tensor, c_tensor, count_tensor]
def scatter_compute_size(input_shapes, torch_dtype):
tensor_shape = input_shapes[0]
batch_size, hidden_size = tensor_shape
index_shape = [batch_size]
input_element_num = sum([math.prod(shape) for shape in [tensor_shape]])
output_element_num = sum([math.prod(shape) for shape in [tensor_shape]])
index_element_num = sum([math.prod(shape) for shape in [index_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
index_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
input_tensor_size = dtype_size * input_element_num + index_dtype_size * index_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def scatter_create_tensors(input_shapes, torch_dtype, xpu_device):
tensor_shape = input_shapes[0]
batch_size, hidden_size = tensor_shape
index_shape = [batch_size]
# create output tensors
dst_tensor = torch.randint(0, 7, tensor_shape, dtype=torch_dtype, device=xpu_device)
# create input tensors
src_tensor = torch.randint(0, 7, tensor_shape, dtype=torch_dtype, device=xpu_device)
index = [i for i in range(batch_size)]
random.shuffle(index)
index_tensor = torch.tensor(index, dtype=torch.int64, device=xpu_device)
index_tensor = index_tensor.reshape(-1, 1).expand(-1, hidden_size)
return [dst_tensor, src_tensor, index_tensor]
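# Note: the per-row index is expanded across hidden_size so that scatter_
# along dim 0 moves whole rows of src_tensor into dst_tensor; shuffling
# range(batch_size) makes this a row permutation without duplicate targets.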
def host2device_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
output_element_num = sum([math.prod(shape) for shape in [a_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
output_tensor_size = dtype_size * output_element_num
tensor_size = output_tensor_size
return batch_size, tensor_size, 0, output_tensor_size
def host2device_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
host_tensor = torch.empty(a_shape, dtype=torch_dtype, device="cpu").pin_memory()
device_tensor = torch.empty(a_shape, dtype=torch_dtype, device=xpu_device)
return [host_tensor, device_tensor]
def allreduce_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
a_tensor = torch.zeros(a_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor]
def allgather_compute_size(input_shapes, torch_dtype):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
output_element_num = sum([math.prod(shape) for shape in [a_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
output_tensor_size = dtype_size * output_element_num
tensor_size = output_tensor_size
return batch_size, tensor_size, 0, output_tensor_size
def allgather_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, = input_shapes
batch_size, hidden_size = a_shape
world_size = dist.get_world_size()
tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
tensors = list(torch.chunk(tensor, world_size, dim=0))
return [tensors]
def alltoall_compute_size(input_shapes, torch_dtype):
a_shape, b_shape = input_shapes
batch_size, hidden_size = a_shape
world_size = dist.get_world_size()
output_element_num = sum([math.prod(shape) for shape in [a_shape]]) * 2
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
output_tensor_size = dtype_size * output_element_num
tensor_size = output_tensor_size
return batch_size, tensor_size, 0, output_tensor_size
def alltoall_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, b_shape = input_shapes
batch_size, hidden_size = a_shape
world_size = dist.get_world_size()
input_tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
input_tensors = list(torch.chunk(input_tensor, world_size, dim=0))
output_tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
output_tensors = list(torch.chunk(output_tensor, world_size, dim=0))
return [input_tensors, output_tensors]
def p2p_compute_size(input_shapes, torch_dtype):
a_shape, b_shape = input_shapes
batch_size, hidden_size = a_shape
input_element_num = sum([math.prod(shape) for shape in [a_shape]])
output_element_num = sum([math.prod(shape) for shape in [b_shape]])
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
input_tensor_size = dtype_size * input_element_num
output_tensor_size = dtype_size * output_element_num
tensor_size = input_tensor_size + output_tensor_size
return batch_size, tensor_size, input_tensor_size, output_tensor_size
def p2p_create_tensors(input_shapes, torch_dtype, xpu_device):
a_shape, b_shape = input_shapes
batch_size, hidden_size = a_shape
a_tensor = torch.empty(a_shape, dtype=torch_dtype, device=xpu_device)
b_tensor = torch.empty(b_shape, dtype=torch_dtype, device=xpu_device)
return [a_tensor, b_tensor]
"""
gemm ops
"""
class GemmOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
compute_dtype = input_tensor_a.dtype
if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
torch.mm(input_tensor_a, input_tensor_b, out=input_tensor_d)
else:
raise Exception(f"GemmOp with dtype {compute_dtype} is not implemented")
class BatchGemmOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
compute_dtype = input_tensor_a.dtype
if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
torch.bmm(input_tensor_a, input_tensor_b, out=input_tensor_d)
else:
raise Exception(f"BatchGemmOp with dtype {compute_dtype} is not implemented")
class GroupGemmOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
compute_dtype = input_tensor_a[0].dtype
for a, b, d in zip(input_tensor_a, input_tensor_b, input_tensor_d):
if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
torch.mm(a, b, out=d)
else:
raise Exception(f"GroupGemmOp with dtype {compute_dtype} is not implemented")
"""
unary ops
"""
class SinOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.sin(input_tensor, out=output_tensor)
class CosOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.cos(input_tensor, out=output_tensor)
class ExpOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.exp(input_tensor, out=output_tensor)
class ExponentialOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
input_tensor.exponential_()
class LogOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.log(input_tensor, out=output_tensor)
class SqrtOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.sqrt(input_tensor, out=output_tensor)
class CastOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
output_tensor = input_tensor.to(output_tensor.dtype)
class SiluOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
output_tensor = torch.nn.functional.silu(input_tensor)
class GeluOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
output_tensor = torch.nn.functional.gelu(input_tensor)
class SwiGLUOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.mul(torch.nn.functional.silu(input_tensor), input_tensor, out=output_tensor)
"""
Binary ops
"""
class AddOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
torch.add(input_tensor_a, input_tensor_b, out=input_tensor_c)
class MulOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
torch.mul(input_tensor_a, input_tensor_b, out=input_tensor_c)
class SubOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
torch.sub(input_tensor_a, input_tensor_b, out=input_tensor_c)
class DivOp(torch.nn.Module):
def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
torch.div(input_tensor_a, input_tensor_b, out=input_tensor_c)
"""
reduction ops
"""
class LayerNormOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor, weight_tensor):
output_tensor = torch.nn.functional.layer_norm(input_tensor, (input_tensor.shape[-1],), weight_tensor)
class SoftmaxOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
output_tensor = torch.nn.functional.softmax(input_tensor, dim=-1, dtype=output_tensor.dtype)
class ReduceSumOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor):
torch.sum(input_tensor, dim=-1, keepdim=True, dtype=output_tensor.dtype, out=output_tensor)
class ReduceMinOp(torch.nn.Module):
def forward(self, input_tensor, value_tensor, indice_tensor):
torch.min(input_tensor, dim=-1, keepdim=True, out=(value_tensor, indice_tensor))
class ReduceMaxOp(torch.nn.Module):
def forward(self, input_tensor, value_tensor, indice_tensor):
torch.max(input_tensor, dim=-1, keepdim=True, out=(value_tensor, indice_tensor))
"""
index_ops
"""
class IndexAddOp(torch.nn.Module):
def forward(self, dst_tensor, src_tensor, index_tensor):
dst_tensor.index_add_(0, index_tensor, src_tensor)
class SortOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor, indice_tensor):
torch.sort(input_tensor, dim=-1, out=(output_tensor, indice_tensor))
class UniqueOp(torch.nn.Module):
def forward(self, input_tensor, output_tensor, count_tensor):
output_tensor, count_tensor = torch.unique(
input=input_tensor,
sorted=False,
return_counts=True,
return_inverse=False
)
class ScatterOp(torch.nn.Module):
def forward(self, dst_tensor, src_tensor, index_tensor):
dst_tensor.scatter_(0, index_tensor, src_tensor)
class GatherOp(torch.nn.Module):
def forward(self, dst_tensor, src_tensor, index_tensor):
torch.gather(src_tensor, 0, index_tensor, out=dst_tensor)
"""
h2d_ops
"""
class Host2DeviceOp(torch.nn.Module):
def forward(self, host_tensor, device_tensor):
device_tensor.copy_(host_tensor)
class Device2HostOp(torch.nn.Module):
def forward(self, host_tensor, device_tensor):
host_tensor.copy_(device_tensor)
"""
communication ops
"""
class AllReduceOp(torch.nn.Module):
def forward(self, input_tensor):
dist.all_reduce(input_tensor, op=dist.ReduceOp.SUM)
class AllGatherOp(torch.nn.Module):
def forward(self, input_tensors):
dist.all_gather(input_tensors, input_tensors[dist.get_rank()])
class ReduceScatterOp(torch.nn.Module):
def forward(self, input_tensors):
dist.reduce_scatter(input_tensors[dist.get_rank()], input_tensors)
class AllToAllOp(torch.nn.Module):
def forward(self, input_tensors, output_tensors):
dist.all_to_all(output_tensors, input_tensors)
class BroadcastOp(torch.nn.Module):
def forward(self, input_tensor):
dist.broadcast(input_tensor, 0)
class P2POp(torch.nn.Module):
def forward(self, send_tensor, recv_tensor):
world_size = dist.get_world_size()
rank = dist.get_rank()
reqs = []
if rank != world_size - 1:
reqs.append(dist.isend(send_tensor, (rank + 1) % world_size))
if rank != 0:
reqs.append(dist.irecv(recv_tensor, (rank - 1 + world_size) % world_size))
for req in reqs:
req.wait()
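# Note: this forms a chain rather than a ring: every rank except the last
# posts an isend to rank + 1, every rank except rank 0 posts an irecv from
# rank - 1, and both requests are waited on before returning.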
op_registry = {
# gemm ops
"gemm": GemmOp(),
"gemv": GemmOp(),
"batch_gemm": BatchGemmOp(),
"group_gemm": GroupGemmOp(),
# unary ops
"sin": SinOp(),
"cos": CosOp(),
"exp": ExpOp(),
"exponential": ExponentialOp(),
"log": LogOp(),
"sqrt": SqrtOp(),
"cast": CastOp(),
"silu": SiluOp(),
"gelu": GeluOp(),
"swiglu": SwiGLUOp(),
# binary ops
"add": AddOp(),
"sub": SubOp(),
"mul": MulOp(),
"div": DivOp(),
# reduction ops
"layernorm": LayerNormOp(),
"softmax": SoftmaxOp(),
"reduce_sum": ReduceSumOp(),
"reduce_max": ReduceMaxOp(),
"reduce_min": ReduceMinOp(),
# index_ops
"index_add": IndexAddOp(),
"sort": SortOp(),
"unique": UniqueOp(),
"scatter": ScatterOp(),
"gather": GatherOp(),
# h2d_ops
"device2host": Device2HostOp(),
"host2device": Host2DeviceOp(),
# ccl ops
"broadcast": BroadcastOp(),
"allreduce": AllReduceOp(),
"allgather": AllGatherOp(),
"alltoall": AllToAllOp(),
"reducescatter": ReduceScatterOp(),
"p2p": P2POp(),
}
op_compute_size_funcs = {
# gemm_ops
"gemm": gemm_compute_size,
"gemv": gemm_compute_size,
"batch_gemm": batch_gemm_compute_size,
"group_gemm": group_gemm_compute_size,
# unary_ops
"sin": sin_compute_size,
"cos": sin_compute_size,
"exp": sin_compute_size,
"exponential": sin_compute_size,
"log": sin_compute_size,
"sqrt": sin_compute_size,
"cast": cast_compute_size,
"silu": sin_compute_size,
"gelu": sin_compute_size,
"swiglu": swiglu_compute_size,
# binary_ops
"add": add_compute_size,
"mul": add_compute_size,
"sub": add_compute_size,
"div": add_compute_size,
# reduction_ops
"layernorm": layer_norm_compute_size,
"softmax": softmax_compute_size,
"reduce_sum": reduce_sum_compute_size,
"reduce_min": reduce_min_compute_size,
"reduce_max": reduce_min_compute_size,
# index_ops
"index_add": index_add_compute_size,
"sort": sort_compute_size,
"unique": unique_compute_size,
"scatter": scatter_compute_size,
"gather": scatter_compute_size,
# h2d_ops
"host2device": host2device_compute_size,
"device2host": host2device_compute_size,
# ccl_ops
"broadcast": host2device_compute_size,
"allreduce": host2device_compute_size,
"allgather": allgather_compute_size,
"alltoall": alltoall_compute_size,
"reducescatter": allgather_compute_size,
"p2p": p2p_compute_size,
}
op_create_tensors_funcs = {
# gemm ops
"gemm": gemm_create_tensors,
"gemv": gemm_create_tensors,
"batch_gemm": batch_gemm_create_tensors,
"group_gemm": group_gemm_create_tensors,
# unary ops
"sin": sin_create_tensors,
"cos": sin_create_tensors,
"exp": sin_create_tensors,
"exponential": sin_create_tensors,
"log": sin_create_tensors,
"sqrt": sin_create_tensors,
"cast": cast_create_tensors,
"silu": sin_create_tensors,
"gelu": sin_create_tensors,
"swiglu": swiglu_create_tensors,
# binary ops
"add": add_create_tensors,
"mul": add_create_tensors,
"sub": add_create_tensors,
"div": add_create_tensors,
# reduction ops
"layernorm": layer_norm_create_tensors,
"softmax": softmax_create_tensors,
"reduce_sum": reduce_sum_create_tensors,
"reduce_min": reduce_min_create_tensors,
"reduce_max": reduce_min_create_tensors,
# index ops
"index_add": index_add_create_tensors,
"sort": sort_create_tensors,
"unique": unique_create_tensors,
"scatter": scatter_create_tensors,
"gather": scatter_create_tensors,
# h2d_ops
"host2device": host2device_create_tensors,
"device2host": host2device_create_tensors,
# ccl_ops
"broadcast": allreduce_create_tensors,
"allreduce": allreduce_create_tensors,
"allgather": allgather_create_tensors,
"alltoall": alltoall_create_tensors,
"reducescatter": allgather_create_tensors,
"p2p": p2p_create_tensors,
}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List
import numpy as np
import torch
from backends import module_store
def dump_communication_ops_report(
op_name: str,
torch_dtype,
input_shapes: List[List[int]],
compute_size_func,
group_size: int,
bandwidth_limit: float,
latency: float,
error: str = ""
):
# get dtype name and dtype_size
dtype_name = str(torch_dtype).split(".")[-1]
dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
element_num = math.prod(input_shapes[0])
tensor_size = dtype_size * element_num
mb = tensor_size / 1024 / 1024
if error == "":
algo_bw = tensor_size / latency / 1e3
"""
allreduce: 2 * (group_size - 1) * (tensor_size / group_size)
allgather: 1 * (group_size - 1) * (tensor_size / group_size)
reducescatter: 1 * (group_size - 1) * (tensor_size / group_size)
alltoall: 1 * (group_size - 1) * (tensor_size / group_size)
broadcast: tensor_size
p2p: tensor_size
"""
if op_name in ["allgather", "reducescatter", "alltoall"]:
bus_bw = algo_bw * (group_size - 1) / group_size
elif op_name in ["allreduce"]:
bus_bw = 2 * algo_bw * (group_size - 1) / group_size
elif op_name in ["broadcast", "p2p", "device2host", "host2device"]:
bus_bw = algo_bw
bandwidth_utils = None
if bandwidth_limit is not None:
bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)
report = {
"Dtype": str(dtype_name),
"Tensor Shapes": input_shapes,
"Memory Size(MB)": round(mb, 2),
"Group": group_size,
"Kernel bandwidth(GB/s)": round(algo_bw, 2),
"Bus bandwidth(GB/s)": round(bus_bw, 2),
"Bandwidth Utilization(%)": bandwidth_utils,
"Avg latency(us)": round(latency, 2),
}
else:
report = {
"Dtype": str(dtype_name),
"Tensor Shapes": input_shapes,
"Memory Size(MB)": round(mb, 2),
"Group": group_size,
"Kernel bandwidth(GB/s)": 0,
"Bus bandwidth(GB/s)": 0,
"Bandwidth Utilization(%)": None,
"Avg latency(us)": 0,
"Error": error,
}
return report
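# Worked example (hypothetical numbers): with tensor_size = 100e6 bytes and
# latency = 1000 us, algo_bw = 100e6 / 1000 / 1e3 = 100 GB/s; for allreduce
# with group_size = 8, bus_bw = 2 * 100 * 7 / 8 = 175 GB/s.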
def dump_computation_ops_report(
op_name: str,
torch_dtype,
input_shapes: List[List[int]],
compute_size_func,
bandwidth_limit: float,
latency: float,
error: str = ""
):
# get dtype name and dtype_size
dtype_name = str(torch_dtype).split(".")[-1]
batch_size, tensor_size, input_tensor_size, output_tensor_size = compute_size_func(input_shapes, torch_dtype)
if error == "":
qps = round(1e6 / latency * batch_size, 2)
algo_bw = tensor_size / latency / 1e3
bandwidth_utils = None
if bandwidth_limit is not None:
bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)
report = {
"Dtype": str(dtype_name),
"Tensor Shapes": input_shapes,
"Read IO Size(MB)": round(input_tensor_size / 1024 / 1024, 2),
"Write IO Size(MB)": round(output_tensor_size / 1024 / 1024, 2),
"Memory Size(MB)": round(tensor_size / 1024 / 1024, 2),
"Kernel bandwidth(GB/s)": round(algo_bw, 2),
"Bandwidth Utilization(%)": bandwidth_utils,
"Avg latency(us)": round(latency, 2),
"QPS": qps,
}
else:
report = {
"Dtype": str(dtype_name),
"Tensor Shapes": input_shapes,
"Read IO Size(MB)": round(input_tensor_size / 1024 / 1024, 2),
"Write IO Size(MB)": round(output_tensor_size / 1024 / 1024, 2),
"Memory Size(MB)": round(tensor_size / 1024 / 1024, 2),
"Kernel bandwidth(GB/s)": 0,
"Bandwidth Utilization(%)": None,
"Avg latency(us)": 0,
"QPS": 0,
"Error": error,
}
return report
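# Worked example (hypothetical numbers): with batch_size = 1024,
# tensor_size = 67,108,864 bytes and latency = 1000 us,
# QPS = 1e6 / 1000 * 1024 = 1,024,000 and
# Kernel bandwidth = 67,108,864 / 1000 / 1e3 ≈ 67.11 GB/s.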
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import json
import time
import datetime
import signal
import argparse
import importlib
import logging
import subprocess
import pathlib
import traceback
import random
from typing import Any, Dict, List
import itertools
from collections import namedtuple
import torch.distributed
import torch.multiprocessing as mp
import virtualenv
import torch
# directory config
CUR_DIR = pathlib.Path.cwd().absolute()
FILE_DIR = pathlib.Path(__file__).parent.absolute()
BYTE_MLPERF_ROOT = FILE_DIR.parent
sys.path.insert(0, str(BYTE_MLPERF_ROOT))
# logger config
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("PerfEngine")
def get_args():
parser = argparse.ArgumentParser()
# hardware config
parser.add_argument(
"--hardware_type",
default="GPU",
help="The backend going to be evaluted, refs to backends/",
)
parser.add_argument(
"--vendor_path",
help="The hardware configs need to be loaded, refs to vendor_zoo/NVIDIA/A100-PCIe.json",
)
# task config
parser.add_argument(
"--task_dir",
default=str(BYTE_MLPERF_ROOT.joinpath("workloads")),
help="The direcotry of tasks going to be evaluted, e.g., set to workloads"
)
parser.add_argument(
"--task",
default="gemm",
help="The task going to be evaluted, refs to workloads/",
)
# feature control
parser.add_argument(
"--parallel",
type=int, default=1,
help="Run all tasks in parallel if available"
)
parser.add_argument(
"--activate_venv",
action="store_true",
help="Enable virtual environment to run the task",
)
args = parser.parse_args()
return args
def load_workload(task: str, task_dir: str) -> Dict[str, Any]:
"""
Return a list of dictionary with model Configuration
Args: List[str]
Returns: List[dic]
"""
modules_dir = pathlib.Path(task_dir).absolute()
# create empty workload json data
workload_dict = {}
for file in modules_dir.iterdir():
if (
file.stem.startswith('_')
or file.stem.startswith('.')
or file.is_dir()
or file.suffix != '.json'
or file.stem != task
):
continue
workload_dict = json.loads(file.read_text())
if not workload_dict:
logger.error(f"could not find {task}.json in {modules_dir}.")
exit(1)
return workload_dict
def parse_workload(workload):
shape_list = []
if "input_shape_groups" in workload:
input_shape_groups = workload["input_shape_groups"] if isinstance(workload["input_shape_groups"], list) else [workload["input_shape_groups"]]
for input_shape_group in input_shape_groups:
if "inputs" in input_shape_group:
input_shape_list = []
for input_shapes in input_shape_group["inputs"]:
input_shape_list.append([list(shape) for shape in itertools.product(*input_shapes)])
if len(input_shape_list) == 1:
shape_list.extend(input_shape_list[0])
else:
shape_list.extend([list(input_shape) for input_shape in zip(*input_shape_list)])
else:
gemm_keys = ["M", "K", "N", "MN", "MK", "KN"]
gemm_values = [input_shape_group.get(k, []) for k in gemm_keys]
if any(gemm_values):
m, k, n, mn, mk, kn = gemm_values
# batch gemm
if "batch_size" in input_shape_group:
bs = input_shape_group.get("batch_size", [])
if m and n and k:
for p in itertools.product(bs, m, k, n):
shape_list.append([[p[0], p[1], p[2]], [p[0], p[2], p[3]]])
if mn and k:
for p in itertools.product(bs, mn, k):
shape_list.append([[p[0], p[1][0], p[2]], [p[0], p[2], p[1][1]]])
if mk and n:
for p in itertools.product(bs, mk, n):
shape_list.append([[p[0], p[1][0], p[1][1]], [p[0], p[1][1], p[2]]])
if m and kn:
for p in itertools.product(bs, m, kn):
shape_list.append([[p[0], p[1], p[2][0]], [p[0], p[2][0], p[2][1]]])
# group gemm
elif "gemm_group" in input_shape_group:
groups = input_shape_group.get("gemm_group", [])
batches = input_shape_group.get("batch", [])
kn = input_shape_group.get("KN", [])
if k and n:
kn.append([list(shape) for shape in itertools.product(k, n)])
for batch in batches:
for _kn in kn:
group_input_shape_list = []
for group in groups:
group_input_shape_list.append([[group * batch, _kn[0]], [_kn[0], _kn[1]]])
shape_list.append(group_input_shape_list)
# gemm
else:
if m and n and k:
for p in itertools.product(m, k, n):
shape_list.append([[p[0], p[1]], [p[1], p[2]]])
if mn and k:
for p in itertools.product(mn, k):
shape_list.append([[p[0][0], p[1]], [p[1], p[0][1]]])
if mk and n:
for p in itertools.product(mk, n):
shape_list.append([[p[0][0], p[0][1]], [p[0][1], p[1]]])
if m and kn:
for p in itertools.product(m, kn):
shape_list.append([[p[0], p[1][0]], [p[1][0], p[1][1]]])
return shape_list
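# Illustrative example (hypothetical workload, not a config from this repo):
#   workload = {"operator": "gemm",
#               "input_shape_groups": {"M": [16, 32], "K": [64], "N": [128]}}
#   parse_workload(workload)
#   -> [[[16, 64], [64, 128]], [[32, 64], [64, 128]]]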
ConfigInstance = namedtuple("ConfigInstance", ["dtype", "tensor_shapes", "index", "total"])
ResultItem = namedtuple("ResultItem", ["config", "report"])
class PerfEngine:
def __init__(self) -> None:
super().__init__()
self.args = get_args()
self.workload = load_workload(self.args.task, self.args.task_dir)
self.backend_type = self.args.hardware_type
self.old_os_path = os.environ["PATH"]
self.prev_sys_path = list(sys.path)
self.real_prefix = sys.prefix
self.version = self.get_version()
def get_version(self):
version = ""
try:
version_file = os.path.join(str(BYTE_MLPERF_ROOT), "../VERSION")
with open(version_file) as f:
_version = f.read().splitlines()
version = '.'.join(v.split('=')[1] for v in _version)
except Exception as e:
traceback.print_exc()
logger.warning(f"get bytemlperf version failed, error msg: {e}")
return version
def get_cpu_name(self):
command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
cpu_name = subprocess.check_output(command, shell=True)
return cpu_name.decode().strip()
def start_engine(self) -> None:
if self.args.activate_venv:
self.activate_venv(self.backend_type)
# init backend
hardware_type = self.backend_type
logger.info("Loading Heterogeneous Backend: {}".format(hardware_type))
backend_module = importlib.import_module(
"backends." + hardware_type + ".backend_" + hardware_type.lower())
self.backend_class = getattr(backend_module, "Backend" + hardware_type)
self.backend = self.backend_class(self.workload, self.args.vendor_path)
# create output dir based on task
# {BYTEMLPERF_ROOT}/byte_micro_perf/reports/{backend_type}/{task_name}
hardware_reports_dir = BYTE_MLPERF_ROOT.joinpath(
"reports", self.backend_type
)
output_dir = BYTE_MLPERF_ROOT.joinpath(
"reports", self.backend_type,
self.workload["operator"]
)
output_dir.mkdir(parents=True, exist_ok=True)
# get input shape info
target_group_list = self.workload.get("group", [1])
target_group_list.sort()
device_count = getattr(self.backend, "get_device_count")()
group_list = []
for group in target_group_list:
if group <= device_count:
group_list.append(group)
else:
break
dtype_list = self.workload.get("dtype", ["float32"])
shape_list = parse_workload(self.workload)
if not group_list or not dtype_list or not shape_list:
logger.error("empty group/dtype/shape")
exit(1)
test_list = []
case_index = 0
for dtype in dtype_list:
for shape in shape_list:
test_list.append(ConfigInstance(dtype, shape, case_index + 1, len(dtype_list) * len(shape_list)))
case_index = case_index + 1
try:
mp.set_start_method("spawn", force=True)
except Exception as e:
traceback.print_exc()
logger.error(f"Set start method failed, error msg: {e}")
# terminate subprocesses
subprocess_pids = []
def signal_handler(signum, frame):
logger.info(f"Received signal {signum}, exiting...")
if subprocess_pids:
for pid in subprocess_pids:
logger.info(f"terminate subprocess: {pid}")
os.kill(pid, signal.SIGTERM)
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
# all operations will enter subprocess to test in parallel
for group in group_list:
logger.info(f"Start to test group size: {group}")
instance_num = min(device_count, max(1, self.args.parallel)) if group == 1 else group
if self.workload["operator"] in ["device2host", "host2device"]:
instance_num = 1
input_queues = mp.Queue()
output_queues = mp.Queue(maxsize=1)
try:
_subprocesses = mp.spawn(
fn=self.perf_func,
args=(instance_num, group, output_dir, test_list, input_queues, output_queues),
nprocs=instance_num,
join=False,
daemon=False
)
subprocess_pids = _subprocesses.pids()
for _ in range(instance_num):
assert "ready" == output_queues.get()
logger.info("all ranks are ready and listening, init done")
start_time = time.perf_counter_ns()
if group == 1:
for test_instance in test_list:
input_queues.put(test_instance, False)
for _ in range(instance_num):
input_queues.put(None, False)
result_list = []
if group == 1:
for _ in range(instance_num):
result_list.extend(output_queues.get())
elif group > 1:
result_list.extend(output_queues.get())
result_list = sorted(result_list, key=lambda x: x.config.index)
dtype_results_mapping = {}
for result in result_list:
if result.config.dtype not in dtype_results_mapping:
dtype_results_mapping[result.config.dtype] = []
dtype_results_mapping[result.config.dtype].append(result)
for dtype, results in dtype_results_mapping.items():
dtype_results_mapping[dtype] = sorted(results, key=lambda x: x.config.index)
base_report = {
"Operator": self.workload["operator"].upper(),
"Backend": self.backend_type,
"Host Info": self.get_cpu_name(),
"Device Info": getattr(self.backend, "get_device_name")(),
"Version": self.version,
"Execution Date": time.strftime("%Y-%m-%d %H:%M:%S"),
"Performance": [result.report for result in dtype_results_mapping[dtype]]
}
filename = (
f"result-{str(dtype)}"
+ (
f"-group{group}"
if group > 1
else ""
)
+ ".json"
)
filepath = output_dir.joinpath(filename)
with open(filepath, "w") as f:
json.dump(base_report, f, indent=4)
for process in _subprocesses.processes:
process.join()
end_time = time.perf_counter_ns()
duration = (end_time - start_time) / 1e9
duration = round(duration, 3)
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
ret_code = 0
for process in _subprocesses.processes:
if process.exitcode != 0:
ret_code = process.exitcode
break
if ret_code != 0:
with open(f"{hardware_reports_dir}/_run_report.log", "a") as f:
print(f"[failed] {self.args.task}, group_size={group}, {current_time}, {duration} s", file=f)
else:
with open(f"{hardware_reports_dir}/_run_report.log", "a") as f:
print(f"[success] {self.args.task}, group_size={group}, {current_time}, {duration} s", file=f)
except Exception as e:
traceback.print_exc()
logger.error(f"Execute task: {self.args.task} failed, group: {group}, error msg: {e}")
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with open(f"{hardware_reports_dir}/_run_report.log", "a") as f:
print(f"[error] {self.args.task}, group_size={group}, {current_time}", file=f)
subprocess_pids = []
time.sleep(1)
if self.args.activate_venv:
self.deactivate_venv()
def perf_func(self, rank: int, *args):
world_size, group_size, output_dir, test_list, input_queues, output_queues = args
backend_instance = self.backend_class(self.workload, self.args.vendor_path)
backend_instance.rank = rank
backend_instance.world_size = world_size
backend_instance.set_device(rank)
if group_size > 1:
backend_instance.initialize_ccl(rank, world_size)
op_name = self.workload["operator"]
backend_instance.get_op_instance()
output_queues.put("ready")
result_list = []
if group_size == 1:
while True:
test_instance = input_queues.get()
if test_instance is None:
break
test_dtype = test_instance.dtype
test_shape = test_instance.tensor_shapes
"""
input_shape could be:
List[int]: single shape. cos
List[List[int]]: multiple inputs. add
List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
"""
if isinstance(test_shape[0], int):
test_shape = [test_shape]
try:
reports = backend_instance.perf(test_shape, test_dtype)
except Exception as e:
traceback.print_exc()
logger.error(f"Execute op: {op_name.lower()} failed, input_shape: {test_shape}, dtype: {test_dtype}, error msg: {e}")
reports = {}
if reports and "Error" not in reports:
result_list.append(ResultItem(test_instance, reports))
latency = reports.get("Avg latency(us)", 0)
kernel_bw = reports.get("Kernel bandwidth(GB/s)", 0)
bus_bw = reports.get("Bus bandwidth(GB/s)", 0)
print(f"rank {rank}, {test_instance}, latency: {latency}\nkernel_bw: {kernel_bw}, bus_bw: {bus_bw}")
else:
print(f"rank {rank}, {test_instance}, error")
output_queues.put(result_list)
elif group_size > 1:
for test_instance in test_list:
test_dtype = test_instance.dtype
test_shape = test_instance.tensor_shapes
"""
input_shape could be:
List[int]: single shape. cos
List[List[int]]: multiple inputs. add
List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
"""
if isinstance(test_shape[0], int):
test_shape = [test_shape]
try:
reports = backend_instance.perf(test_shape, test_dtype)
except Exception as e:
traceback.print_exc()
logger.error(f"Execute op: {op_name.lower()} failed, input_shape: {test_shape}, dtype: {test_dtype}, error msg: {e}")
reports = {}
if reports and "Error" not in reports:
result_list.append(ResultItem(test_instance, reports))
latency = reports.get("Avg latency(us)", 0)
kernel_bw = reports.get("Kernel bandwidth(GB/s)", 0)
bus_bw = reports.get("Bus bandwidth(GB/s)", 0)
if rank == 0:
print(f"rank {rank}, {test_instance}, latency: {latency}\nkernel_bw: {kernel_bw}, bus_bw: {bus_bw}")
else:
if rank == 0:
print(f"rank {rank}, {test_instance}, error")
if rank == 0:
output_queues.put(result_list)
if group_size > 1:
backend_instance.destroy_process_group()
def activate_venv(self, hardware_type: str) -> bool:
if os.path.exists("backends/" + hardware_type + "/requirements.txt"):
logger.info("Activating Virtual Env for " + hardware_type)
venv_dir = os.path.join("backends", hardware_type + "/venv")
activate_file = os.path.join(venv_dir, "bin", "activate_this.py")
if not os.path.exists(venv_dir):
logger.info("venv not exist, Creating Virtual Env for " + hardware_type)
virtualenv.create_environment(venv_dir, True)
exec(open(activate_file).read(), {"__file__": activate_file})
python_path = os.path.join(venv_dir, "bin", "python3")
subprocess.call(
[python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"]
)
subprocess.call(
[
python_path,
"-m",
"pip",
"install",
"-r",
"backends/" + hardware_type + "/requirements.txt",
"-q",
]
)
else:
exec(open(activate_file).read(), {"__file__": activate_file})
"""
just in case install failed in pre-run.
"""
python_path = os.path.join(venv_dir, "bin", "python3")
subprocess.call(
[python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"]
)
subprocess.call(
[
python_path,
"-m",
"pip",
"install",
"-r",
"backends/" + hardware_type + "/requirements.txt",
"-q",
]
)
if not hasattr(sys, "real_prefix"):
return False
return True
return True
def deactivate_venv(self):
sys.path[:0] = self.prev_sys_path # will also revert the added site-packages
sys.prefix = self.real_prefix
os.environ["PATH"] = self.old_os_path
if __name__ == "__main__":
engine = PerfEngine()
engine.start_engine()
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import pathlib
import logging
import subprocess
import signal
# directory config
CUR_DIR = pathlib.Path.cwd().absolute()
FILE_DIR = pathlib.Path(__file__).parent.absolute()
BYTE_MLPERF_ROOT = FILE_DIR
sys.path.insert(0, str(BYTE_MLPERF_ROOT))
# logger config
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("lanuch")
def parse_task(task_dir):
tasks = []
if os.path.isdir(task_dir):
for root, _, files in os.walk(task_dir, topdown=False):
for name in files:
if name.endswith(".json"):
tasks.append(name.rsplit('.', 1)[0])
return tasks
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# hardware config
parser.add_argument(
"--hardware_type",
default="GPU",
help="The backend going to be evaluted, refs to backends/",
)
parser.add_argument(
"--vendor_path",
help="The hardware configs need to be loaded, refs to vendor_zoo/",
)
# task config
parser.add_argument(
"--task_dir",
default=str(BYTE_MLPERF_ROOT.joinpath("workloads").absolute()),
help="The direcotry of tasks going to be evaluted, e.g., set to workloads"
)
parser.add_argument(
"--task",
default="all",
help="The task going to be evaluted, refs to workloads/, default use all tasks in workloads/"
)
# list all supported task and hardware
parser.add_argument(
"--show_task_list",
action="store_true",
help="Print all available task names"
)
parser.add_argument(
"--show_hardware_list",
action="store_true",
help="Print all hardware bytemlperf supported",
)
# feature control
parser.add_argument(
"--parallel",
type=int, default=1,
help="Run all tasks in parallel if available"
)
parser.add_argument(
"--install_requirements", action="store_true",
help="Install all required packages"
)
parser.add_argument(
"--activate_venv", action="store_true",
help="Enable python virtual environment"
)
args = parser.parse_args()
args.vendor_path = pathlib.Path(args.vendor_path).absolute() if args.vendor_path else None
args.task_dir = pathlib.Path(args.task_dir).absolute()
os.chdir(str(BYTE_MLPERF_ROOT))
# show tasks
task_list = [file.stem for file in args.task_dir.iterdir()]
task_list.sort()
task_mapping = {
"all": task_list,
"gemm_ops": [],
"unary_ops": [],
"binary_ops": [],
"reduction_ops": [],
"index_ops": [],
"h2d_ops": [],
"ccl_ops": []
}
for task in task_list:
if task in ["gemm", "gemv", "batch_gemm", "group_gemm"]:
task_mapping["gemm_ops"].append(task)
if task in ["sin", "cos", "exp", "exponential", "log", "sqrt", "cast", "silu", "gelu", "swiglu"]:
task_mapping["unary_ops"].append(task)
if task in ["add", "mul", "sub", "div"]:
task_mapping["binary_ops"].append(task)
if task in ["layernorm", "softmax", "reduce_sum", "reduce_max", "reduce_min"]:
task_mapping["reduction_ops"].append(task)
if task in ["index_add", "sort", "unique", "gather", "scatter"]:
task_mapping["index_ops"].append(task)
if task in ["host2device", "device2host", "device2device"]:
task_mapping["h2d_ops"].append(task)
if task in ["allgather", "allreduce", "alltoall", "broadcast", "p2p", "reduce_scatter"]:
task_mapping["ccl_ops"].append(task)
if args.show_task_list:
logger.info("******************* Supported Task *******************")
print(task_list)
exit(0)
# show hardwares
hardware_list = []
for file in BYTE_MLPERF_ROOT.joinpath("backends").iterdir():
if file.is_dir() and file.stem.startswith("_") is False:
hardware_list.append(file.stem)
if args.show_hardware_list:
logger.info("***************** Supported Hardware Backend *****************")
print(hardware_list)
exit(0)
# check task
test_cases = []
if args.task in task_mapping.keys():
test_cases = task_mapping[args.task]
else:
specified_tasks = args.task.split(",")
for task in specified_tasks:
if task not in task_list:
logger.error(f"Task {task} not found in {args.task_dir}")
exit(1)
test_cases.append(task)
logger.info(f"******************* Tasks: *****************")
logger.info(f"{test_cases}\n")
# check hardware
hardware = args.hardware_type
if hardware not in hardware_list:
logger.error(f"Hardware {hardware} not found in {BYTE_MLPERF_ROOT.joinpath('backends')}")
exit(1)
logger.info(f"******************* hardware: *****************")
logger.info(f"{hardware}\n")
if args.install_requirements:
logger.info("******************* Pip Package Installing *******************")
subprocess.run(
["python3", "-m", "pip", "install", "pip", "--upgrade", "--quiet"]
)
subprocess.run(
["python3", "-m", "pip", "install", "-r", "requirements.txt", "--quiet"]
)
if not args.activate_venv:
subprocess.run(
["python3", "-m", "pip", "install", "-r", f"backends/{hardware}/requirements.txt", "--quiet"]
)
outputs_dir = pathlib.Path(BYTE_MLPERF_ROOT).joinpath("reports", args.hardware_type)
if not outputs_dir.exists():
outputs_dir.mkdir(parents=True)
with open(f"{BYTE_MLPERF_ROOT}/reports/{args.hardware_type}/_run_report.log", "w") as file:
pass
# terminate task perf process
subprocess_pid = -1
def signal_handler(signum, frame):
logger.info(f"Received signal {signum}, exiting...")
if subprocess_pid != -1:
logger.info(f"terminate subprocess: {subprocess_pid}")
os.kill(subprocess_pid, signal.SIGTERM)
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
failed_ops = []
for task in test_cases:
cmds = [
"python3",
"./core/perf_engine.py",
"--hardware_type", args.hardware_type,
"--vendor_path", str(args.vendor_path),
"--task", task,
"--task_dir", str(args.task_dir),
"--parallel", str(args.parallel)
]
if args.activate_venv:
cmds.append("--activate_venv")
print(f"******************************************* Start to test op: [{task}]. *******************************************")
process = subprocess.Popen(cmds)
subprocess_pid = process.pid
ret = process.wait()
if ret != 0:
failed_ops.append(task)
print("")
if failed_ops:
logger.error(f"Failed ops: {failed_ops}")
exit(1)
else:
logger.info("All ops passed")
matplotlib
pandas
virtualenv==16.7.12
scikit-learn
prompt_toolkit
tqdm
opencv-python
transformers
tokenization
fpdf
attrs
decorator
typing-extensions
pydot
source /home/workspace/dtk-24.04.3/env.sh
python3 ./launch.py --parallel 8
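# Other useful invocations (sketches only; adjust task and backend names to your environment):
#   python3 ./launch.py --show_task_list
#   python3 ./launch.py --show_hardware_list
#   python3 ./launch.py --task gemm_ops --parallel 8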
import sys
import csv
import json
import pathlib
import argparse
import logging
CUR_DIR = pathlib.Path(__file__).parent.absolute()
PRJ_ROOT_DIR = CUR_DIR.parent
sys.path.insert(0, str(PRJ_ROOT_DIR))
unique_attrs = [
"op_name",
"sku_name",
"owner",
"perf_mode"
]
def get_unique_key(
op_name,
sku_name,
owner,
perf_mode,
*args,
**kwargs
):
return ".".join([
sku_name,
owner,
op_name,
perf_mode
]).replace(" ", "_")
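# Worked example (values are illustrative): get_unique_key("gemm", "A800 80GB SXM", "torch", "host")
# returns "A800_80GB_SXM.torch.gemm.host" -- sku_name, owner, op_name and perf_mode joined by "."
# with spaces replaced by underscores.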
arguments_map = {
# unary operators
# [batch, len] --> [batch, len]
"sin": ["dtype", "batch", "len"],
"cos": ["dtype", "batch", "len"],
"exp": ["dtype", "batch", "len"],
"exponential": ["dtype", "batch", "len"],
"silu": ["dtype", "batch", "len"],
"gelu": ["dtype", "batch", "len"],
"swiglu": ["dtype", "batch", "len"],
# float32: float32 --> float16/bfloat16
# float16: float16 --> float32
# bfloat16: bfloat16 --> float32
"cast": ["dtype", "batch", "len"],
# binary operators
# [batch, len] (op) [batch, len] --> [batch, len]
"add": ["dtype", "batch", "len"],
"mul": ["dtype", "batch", "len"],
"sub": ["dtype", "batch", "len"],
"div": ["dtype", "batch", "len"],
# reduction operators
# [batch, len] --> [batch, len]
"layernorm": ["dtype", "batch", "len"],
"softmax": ["dtype", "batch", "len"],
# [batch, len] --> [batch, 1]
"reduce_sum": ["dtype", "batch", "len"],
"reduce_min": ["dtype", "batch", "len"],
"reduce_max": ["dtype", "batch", "len"],
# index operators
# [batch, len] (op) [batch] --> [batch, len]
"index_add": ["dtype", "batch", "len"],
# [batch, len] --> [batch, len]
"sort": ["dtype", "batch", "len"],
"unique": ["dtype", "batch", "len"],
"gather": ["dtype", "batch", "len"],
"scatter": ["dtype", "batch", "len"],
# matrix multiply (GEMM) operators
# [M, K] * [K, N] --> [M, N]
"gemm": ["dtype", "M", "N", "K"],
# [batch, M, K] * [batch, K, N] --> [batch, M, N]
"batch_gemm": ["dtype", "batch", "M", "N", "K"],
# group x ([M, K] * [K, N] --> [M, N])
"group_gemm": ["dtype", "batch", "group", "M_str", "N", "K"],
# communication (collective) operators
# [batch, len] --> [batch, len]
# tp_size split over batch
"broadcast": ["dtype", "tp_size", "batch", "len"],
"allreduce": ["dtype", "tp_size", "batch", "len"],
"allgather": ["dtype", "tp_size", "batch", "len"],
"alltoall": ["dtype", "tp_size", "batch", "len"],
"reducescatter": ["dtype", "tp_size", "batch", "len"],
"p2p": ["dtype", "tp_size", "batch", "len"],
"device2host": ["dtype", "batch", "len"],
"host2device": ["dtype", "batch", "len"]
}
target_attrs = [
# latency in us
"latency"
]
def get_csv_headers(op_name):
return unique_attrs + arguments_map.get(op_name, []) + target_attrs
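# For example, get_csv_headers("gemm") yields:
#   ["op_name", "sku_name", "owner", "perf_mode", "dtype", "M", "N", "K", "latency"]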
logger = logging.getLogger("bytemlperf_aeolus")
def setup_logger(loglevel: str):
fmt = logging.Formatter(
fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s]: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(fmt)
logger.addHandler(handler)
logger.setLevel(loglevel.upper())
logger.propagate = False
sku_name_mapping = {
"MLU590-M9": "MLU590 M9",
"MLU590-M9D": "MLU590 M9D",
"MLU590-M9DK": "MLU590 M9D",
"Iluvatar BI-V150": "BI-V150",
"NVIDIA A800-SXM4-80GB": "A800 80GB SXM",
"NVIDIA H800": "H800 80GB SXM",
"NVIDIA H20": "H20 96GB SXM",
"Ascend910B2C": "Ascend910B2"
}
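# normalize raw device names reported by each backend to canonical SKU names;
# names without an entry are kept as-is (see postprocess below)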
dtype_map = {
"float": "float32",
"half": "float16",
"int": "int32"
}
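# normalize abbreviated torch dtype strings to explicit names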
def normal_ops_func(op, sku_name, frame, perf_mode, json_data):
if not json_data or "Error" in json_data:
return
dtype = json_data["Dtype"]
if dtype in dtype_map:
dtype = dtype_map[dtype]
batch = json_data["Tensor Shapes"][0][0]
length = json_data["Tensor Shapes"][0][1]
latency = json_data["Avg latency(us)"]
return [op, sku_name, frame, perf_mode, dtype, batch, length, latency]
def gemm_func(op, sku_name, frame, perf_mode, json_data):
if not json_data or "Error" in json_data:
return
dtype = json_data["Dtype"]
if dtype in dtype_map:
dtype = dtype_map[dtype]
M = json_data["Tensor Shapes"][0][0]
K = json_data["Tensor Shapes"][0][1]
N = json_data["Tensor Shapes"][1][1]
latency = json_data["Avg latency(us)"]
return [op, sku_name, frame, perf_mode, dtype, M, N, K, latency]
def batch_gemm_func(op, sku_name, frame, perf_mode, json_data):
if not json_data or "Error" in json_data:
return
dtype = json_data["Dtype"]
if dtype in dtype_map:
dtype = dtype_map[dtype]
batch_size = json_data["Tensor Shapes"][0][0]
M = json_data["Tensor Shapes"][0][1]
K = json_data["Tensor Shapes"][0][2]
N = json_data["Tensor Shapes"][1][2]
latency = json_data["Avg latency(us)"]
return [op, sku_name, frame, perf_mode, dtype, batch_size, M, N, K, latency]
def group_gemm_func(op, sku_name, frame, perf_mode, json_data):
if not json_data or "Error" in json_data:
return
dtype = json_data["Dtype"]
if dtype in dtype_map:
dtype = dtype_map[dtype]
batch_size = json_data["Tensor Shapes"][0][0][0]
group = len(json_data["Tensor Shapes"])
M_list = [int(json_data["Tensor Shapes"][i][0][0]) // batch_size for i in range(group)]
M_list_str = "/".join([str(m) for m in M_list])
K = json_data["Tensor Shapes"][0][0][1]
N = json_data["Tensor Shapes"][0][1][1]
latency = json_data["Avg latency(us)"]
return [op, sku_name, frame, perf_mode, dtype, batch_size, group, M_list_str, N, K, latency]
def ccl_ops_func(op, sku_name, frame, perf_mode, json_data):
if not json_data or "Error" in json_data:
return
dtype = json_data["Dtype"]
if dtype in dtype_map:
dtype = dtype_map[dtype]
tp_size = json_data["Group"]
batch = json_data["Tensor Shapes"][0][0]
length = json_data["Tensor Shapes"][0][1]
latency = json_data["Avg latency(us)"]
return [op, sku_name, frame, perf_mode, dtype, tp_size, batch, length, latency]
def d2h_h2d_func(op, sku_name, frame, perf_mode, json_data):
if not json_data or "Error" in json_data:
return
dtype = json_data["Dtype"]
if dtype in dtype_map:
dtype = dtype_map[dtype]
batch = json_data["Tensor Shapes"][0][0]
length = json_data["Tensor Shapes"][0][1]
latency = json_data["Avg latency(us)"]
return [op, sku_name, frame, perf_mode, dtype, batch, length, latency]
post_func_map = {
"sin": normal_ops_func,
"cos": normal_ops_func,
"exp": normal_ops_func,
"exponential": normal_ops_func,
"silu": normal_ops_func,
"gelu": normal_ops_func,
"swiglu": normal_ops_func,
"cast": normal_ops_func,
"add": normal_ops_func,
"mul": normal_ops_func,
"sub": normal_ops_func,
"div": normal_ops_func,
"layernorm": normal_ops_func,
"softmax": normal_ops_func,
"reduce_sum": normal_ops_func,
"reduce_min": normal_ops_func,
"reduce_max": normal_ops_func,
"index_add": normal_ops_func,
"sort": normal_ops_func,
"unique": normal_ops_func,
"gather": normal_ops_func,
"scatter": normal_ops_func,
"gemm": gemm_func,
"batch_gemm": batch_gemm_func,
"group_gemm": group_gemm_func,
"broadcast": ccl_ops_func,
"allreduce": ccl_ops_func,
"allgather": ccl_ops_func,
"alltoall": ccl_ops_func,
"reducescatter": ccl_ops_func,
"p2p": ccl_ops_func,
"device2host": d2h_h2d_func,
"host2device": d2h_h2d_func
}
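# Each post-processing function returns one CSV row in the same column order as
# get_csv_headers(op), or None when the report entry is missing or contains an "Error" field.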
def postprocess(op, file_list, dst_dir):
json_data_list = [json.load(open(file)) for file in file_list]
if not json_data_list:
logger.error(f"no data found in {file_list}")
return
sku_name = json_data_list[0]["Device Info"]
sku_name = sku_name_mapping.get(sku_name, sku_name)
perf_datas = []
for json_data in json_data_list:
if "Performance" not in json_data:
logger.error(f"no performance data")
continue
perf_data = json_data["Performance"]
if not perf_datas:
perf_datas = perf_data
else:
perf_datas.extend(perf_data)
unique_name = get_unique_key(op, sku_name, "torch", "host")
unique_csv_file = f"{unique_name}.csv"
unique_csv_path = dst_dir / unique_csv_file
with open(unique_csv_path, "w") as f:
writer = csv.writer(f)
writer.writerow(get_csv_headers(op))
for perf_data in perf_datas:
if op in post_func_map:
row = post_func_map[op](op, sku_name, "torch", "host", perf_data)
if row:
writer.writerow(row)
def convert_src(src, dst):
logger.info(f"src: {src}")
logger.info(f"dst: {dst}")
op_data_map = {}
for file in src.rglob("*.json"):
dir_name = file.parent.name
if dir_name == "gemv":
dir_name = "gemm"
if dir_name not in op_data_map:
op_data_map[dir_name] = []
op_data_map[dir_name].append(file)
for op, files in op_data_map.items():
logger.info(f"op: {op}")
if op not in arguments_map and op != "gemv":
logger.error(f"invalid op: {op}")
continue
postprocess(op, files, dst)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str, required=True)
parser.add_argument("--output_dir", type=str, default="./temp")
parser.add_argument("--log_level", type=str, default="INFO")
args = parser.parse_args()
setup_logger(args.log_level)
src_dir = pathlib.Path(args.src).absolute()
if not src_dir.exists():
logger.error(f"{args.src} does not exist")
exit(1)
elif not src_dir.is_dir():
logger.error(f"{args.src} is not a directory")
exit(1)
output_dir = pathlib.Path(args.output_dir).absolute()
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)
elif not output_dir.is_dir():
logger.error(f"{args.output_dir} is not a directory")
exit(1)
convert_src(src_dir, output_dir)
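# Example invocation (the script filename and report path are illustrative):
#   python3 convert_reports.py --src reports/<hardware_type> --output_dir ./temp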
{
"operator": "add",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
],
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "allgather",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
[1024]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
],
"group": [
2,
4,
8
]
}
{
"operator": "allreduce",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
[1024]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
],
"group": [
2,
4,
8
]
}
{
"operator": "alltoall",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
[1024]
],
[
[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
[1024]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
],
"group": [
2,
4,
8
]
}
{
"operator": "batch_gemm",
"iterations": 100,
"input_shape_groups": {
"batch_size": [8, 12, 16, 20, 24, 28, 32],
"M": [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
"KN": [
[1024, 1024],
[4096, 4096],
[8192, 8192],
[16384, 32],
[16384, 128],
[16384, 1024],
[32, 16384],
[128, 16384],
[1024, 16384]]
},
"dtype": [
"float32",
"bfloat16",
"float16",
"int8"
]
}
{
"operator": "broadcast",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
[1024]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
],
"group": [
2,
4,
8
]
}
{
"operator": "cast",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "cos",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "device2host",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[1024]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "div",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
],
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "exp",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "exponential",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
[8192]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}
{
"operator": "gather",
"iterations": 100,
"input_shape_groups": {
"inputs": [
[
[1024],
[1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
]
]
},
"dtype": [
"float32",
"bfloat16",
"float16"
]
}