Commit d2b71343 authored by 雍大凯's avatar 雍大凯
Browse files

add code

parent 69e57885
# Copyright (c) Phigent Robotics. All rights reserved.
import numpy as np
import torch
from . import nearest_assign_ext
__all__ = ['nearest_assign']
class QuickNearestAssignCuda(torch.autograd.Function):
    """Forward-only autograd wrapper around ``nearest_assign_forward``.

    No ``backward`` is defined in this class.
    """

    @staticmethod
    def forward(ctx,
                occ_pred,
                l2s_key,
                occind2detind,
                inst_cls,
                inst_xyz,
                inst_id_list,
                ):
        """Run the extension op and return the buffer it fills in.

        Every argument is made contiguous and cast to int32 before it is
        handed to the extension. The output buffer starts out as zeros with
        the same shape as the converted ``occ_pred``.
        """
        # Same normalisation for every input: contiguous first, then int32.
        (occ_pred, l2s_key, occind2detind,
         inst_cls, inst_xyz, inst_id_list) = (
            tensor.contiguous().int()
            for tensor in (occ_pred, l2s_key, occind2detind,
                           inst_cls, inst_xyz, inst_id_list))

        # Zero-filled output; the extension writes into it in place.
        inst_pred = occ_pred.new_zeros(occ_pred.shape)

        nearest_assign_ext.nearest_assign_forward(
            occ_pred, l2s_key, occind2detind,
            inst_cls, inst_xyz, inst_id_list,
            inst_pred)
        return inst_pred
def nearest_assign(occ_pred,
                   l2s_key,
                   occind2detind,
                   inst_cls,
                   inst_xyz,
                   inst_id_list):
    """Functional entry point for :class:`QuickNearestAssignCuda`.

    All arguments are forwarded unchanged, in the same order, to
    ``QuickNearestAssignCuda.apply`` and its result is returned.
    """
    # Layout note carried over from the original: (B, Dz, Dy, Dx, C)
    return QuickNearestAssignCuda.apply(
        occ_pred, l2s_key, occind2detind, inst_cls, inst_xyz, inst_id_list)
def test_bev_pool_v2():
    """Check forward output and gradients of ``bev_pool_v2`` on a tiny input.

    NOTE(review): ``bev_pool_v2`` is not imported or defined in the visible
    part of this module, so this test raises ``NameError`` unless the name is
    provided elsewhere -- confirm whether the test belongs in this file.
    Requires a CUDA device.
    """
    depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9])
    depth = torch.from_numpy(depth).float().cuda()
    depth = depth.view(1, 1, 2, 2, 2).requires_grad_()
    feat = torch.ones(
        size=[1, 1, 2, 2, 2], dtype=torch.float,
        device='cuda').requires_grad_()
    ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda()
    ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda()
    ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda()

    # Mark the first element of every run of equal ``ranks_bev`` values.
    kept = torch.ones(
        ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
    kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
    interval_starts = torch.where(kept)[0].int()
    if len(interval_starts) == 0:
        return None, None, None, None, None

    # Length of each run: distance to the next start, or to the end.
    interval_lengths = torch.zeros_like(interval_starts)
    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
    interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]

    bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
                           (1, 1, 2, 2, 2), interval_starts, interval_lengths)
    loss = torch.sum(bev_feat)
    loss.backward()

    # The loss is a float32 sum; compare with a tolerance like the gradient
    # checks below instead of exact equality against the literal 4.4.
    assert loss.detach().allclose(loss.new_tensor(4.4))

    grad_depth = np.array([2., 2., 0., 0., 2., 0., 2., 0.])
    grad_depth = torch.from_numpy(grad_depth).float()
    grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2)
    assert depth.grad.allclose(grad_depth)

    grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0., 0.])
    grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2)
    assert feat.grad.allclose(grad_feat)
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
// CUDA function declarations
// Host-side launcher declared here and called by nearest_assign_forward()
// below with raw int pointers taken from the tensors.
//   l2s_key / l2s_size : label array and its length
//   occind2detind      : lookup table indexed by an occupancy label
//   inst_size          : number of entries in the three inst_* arrays
//   occ_pred           : input labels
//   inst_xyz           : 3 ints per instance
//   inst_cls           : one int per instance
//   inst_id_list       : one int per instance
//   inst_pred          : output buffer
void nearest_assign(
    const int* l2s_key,
    int l2s_size,
    const int *__restrict__ occind2detind,
    int inst_size,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int* __restrict__ inst_pred);
// Python-facing entry point: validates the tensors, then passes their raw
// int32 pointers to the nearest_assign() launcher.
//
// data_ptr<int>() already rejects tensors that are not int32. The checks
// below reject only inputs that would otherwise make the kernel read or
// write out of bounds (host memory, strided storage, short buffers).
void nearest_assign_forward(
    const at::Tensor _occ_pred,       // (200, 200, 16)
    const at::Tensor _l2s_key,        // (l2s_size, 1)
    const at::Tensor _occind2detind,  // (10, 1)
    const at::Tensor _inst_cls,       // (inst_size, 1)
    const at::Tensor _inst_xyz,       // (inst_size, 3)
    const at::Tensor _inst_id_list,   // (inst_size, 1)
    at::Tensor _inst_pred             // (200, 200, 16)
) {
  // The launcher always starts one thread per voxel of a 200 x 200 x 16 grid.
  constexpr int64_t kNumVoxels = 200 * 200 * 16;

  TORCH_CHECK(_occ_pred.is_cuda() && _l2s_key.is_cuda() &&
                  _occind2detind.is_cuda() && _inst_cls.is_cuda() &&
                  _inst_xyz.is_cuda() && _inst_id_list.is_cuda() &&
                  _inst_pred.is_cuda(),
              "nearest_assign_forward: all tensors must be CUDA tensors");
  TORCH_CHECK(_occ_pred.is_contiguous() && _l2s_key.is_contiguous() &&
                  _occind2detind.is_contiguous() &&
                  _inst_cls.is_contiguous() && _inst_xyz.is_contiguous() &&
                  _inst_id_list.is_contiguous() &&
                  _inst_pred.is_contiguous(),
              "nearest_assign_forward: all tensors must be contiguous");
  TORCH_CHECK(_occ_pred.numel() >= kNumVoxels &&
                  _inst_pred.numel() >= kNumVoxels,
              "nearest_assign_forward: occ_pred and inst_pred must hold at "
              "least 200*200*16 elements, got ",
              _occ_pred.numel(), " and ", _inst_pred.numel());

  const int64_t num_l2s = _l2s_key.size(0);
  const int64_t num_inst = _inst_xyz.size(0);
  // The kernel reads 3 ints per instance from inst_xyz and one int per
  // instance from inst_cls / inst_id_list.
  TORCH_CHECK(_inst_xyz.numel() >= 3 * num_inst &&
                  _inst_cls.numel() >= num_inst &&
                  _inst_id_list.numel() >= num_inst,
              "nearest_assign_forward: inst_xyz, inst_cls and inst_id_list "
              "are too small for ", num_inst, " instances");

  const int l2s_size = static_cast<int>(num_l2s);
  const int inst_size = static_cast<int>(num_inst);

  // Launch on the device that owns occ_pred.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_occ_pred));

  const int* occ_pred = _occ_pred.data_ptr<int>();
  const int* inst_xyz = _inst_xyz.data_ptr<int>();
  const int* inst_cls = _inst_cls.data_ptr<int>();
  const int* l2s_key = _l2s_key.data_ptr<int>();
  const int* inst_id_list = _inst_id_list.data_ptr<int>();
  const int* occind2detind = _occind2detind.data_ptr<int>();
  int* inst_pred = _inst_pred.data_ptr<int>();

  nearest_assign(
      l2s_key,
      l2s_size,
      occind2detind,
      inst_size,
      occ_pred,
      inst_xyz,
      inst_cls,
      inst_id_list,
      inst_pred);
}
// Python binding: exposes nearest_assign_forward under its own name, with
// the same string used as the docstring.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  const char* forward_doc = "nearest_assign_forward";
  m.def("nearest_assign_forward", &nearest_assign_forward, forward_doc);
}
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <stdio.h>
#include <stdlib.h>
// One thread per voxel of a hard-coded 200 x 200 x 16 grid.
//
// For the voxel at flat index idx:
//   * If its label occ_pred[idx] does not occur in l2s_key, the label is
//     copied to inst_pred[idx].
//   * Otherwise, among the instances whose inst_cls equals
//     occind2detind[label], the one with the smallest squared distance
//     between the voxel's (x, y, z) index and its inst_xyz entry is chosen
//     (first one wins on ties) and its inst_id_list value is written to
//     inst_pred[idx]. If no instance qualifies, inst_pred[idx] is left
//     untouched.
__global__ void nearest_assign_kernel(
    const int* l2s_key,
    int l2s_size,
    const int* occind2detind,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int inst_size,
    int* __restrict__ inst_pred) {
  const int kDimY = 200;
  const int kDimZ = 16;
  const int kNumVoxels = 200 * kDimY * kDimZ;

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Bounds guard: keeps the kernel safe for launch configurations that
  // start more threads than there are voxels.
  if (idx >= kNumVoxels) {
    return;
  }

  const int occ_pred_label = occ_pred[idx];

  // Is this label one of the l2s_key entries?
  bool label_listed = false;
  for (int index = 0; index < l2s_size; index++) {
    if (occ_pred_label == l2s_key[index]) {
      label_listed = true;
      break;
    }
  }
  if (!label_listed) {
    inst_pred[idx] = occ_pred_label;
    return;
  }

  // Recover the 3-D index from the flat index (z varies fastest).
  const int x = idx / (kDimY * kDimZ);
  const int y = (idx - x * kDimY * kDimZ) / kDimZ;
  const int z = idx - x * kDimY * kDimZ - y * kDimZ;

  int dist_min = 100000000;
  int best_id = 0;
  bool found = false;
  for (int inst_ind = 0; inst_ind < inst_size; inst_ind++) {
    if (inst_cls[inst_ind] == occind2detind[occ_pred_label]) {
      const int dx = x - inst_xyz[inst_ind * 3 + 0];
      const int dy = y - inst_xyz[inst_ind * 3 + 1];
      const int dz = z - inst_xyz[inst_ind * 3 + 2];
      const int dist = dx * dx + dy * dy + dz * dz;
      if (dist < dist_min) {
        dist_min = dist;
        best_id = inst_id_list[inst_ind];
        found = true;
      }
    }
  }

  // Single store of the winner instead of one store per improvement.
  if (found) {
    inst_pred[idx] = best_id;
  }
}
// Host launcher: starts nearest_assign_kernel with 256 threads per block and
// enough blocks to give each voxel of the 200 x 200 x 16 grid one thread.
void nearest_assign(
    const int* l2s_key,
    int l2s_size,
    const int *__restrict__ occind2detind,
    int inst_size,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int* __restrict__ inst_pred) {
  const int threads_per_block = 256;
  const int num_voxels = 200 * 200 * 16;
  // Integer ceil-division; evaluates to 2500 blocks.
  const int num_blocks =
      (num_voxels + threads_per_block - 1) / threads_per_block;

  nearest_assign_kernel<<<num_blocks, threads_per_block>>>(
      l2s_key, l2s_size, occind2detind,
      occ_pred, inst_xyz, inst_cls,
      inst_id_list, inst_size, inst_pred);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <stdio.h>
#include <stdlib.h>
// One thread per voxel of a hard-coded 200 x 200 x 16 grid.
//
// For the voxel at flat index idx:
//   * If its label occ_pred[idx] does not occur in l2s_key, the label is
//     copied to inst_pred[idx].
//   * Otherwise, among the instances whose inst_cls equals
//     occind2detind[label], the one with the smallest squared distance
//     between the voxel's (x, y, z) index and its inst_xyz entry is chosen
//     (first one wins on ties) and its inst_id_list value is written to
//     inst_pred[idx]. If no instance qualifies, inst_pred[idx] is left
//     untouched.
__global__ void nearest_assign_kernel(
    const int* l2s_key,
    int l2s_size,
    const int* occind2detind,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int inst_size,
    int* __restrict__ inst_pred) {
  const int kDimY = 200;
  const int kDimZ = 16;
  const int kNumVoxels = 200 * kDimY * kDimZ;

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Bounds guard: keeps the kernel safe for launch configurations that
  // start more threads than there are voxels.
  if (idx >= kNumVoxels) {
    return;
  }

  const int occ_pred_label = occ_pred[idx];

  // Is this label one of the l2s_key entries?
  bool label_listed = false;
  for (int index = 0; index < l2s_size; index++) {
    if (occ_pred_label == l2s_key[index]) {
      label_listed = true;
      break;
    }
  }
  if (!label_listed) {
    inst_pred[idx] = occ_pred_label;
    return;
  }

  // Recover the 3-D index from the flat index (z varies fastest).
  const int x = idx / (kDimY * kDimZ);
  const int y = (idx - x * kDimY * kDimZ) / kDimZ;
  const int z = idx - x * kDimY * kDimZ - y * kDimZ;

  int dist_min = 100000000;
  int best_id = 0;
  bool found = false;
  for (int inst_ind = 0; inst_ind < inst_size; inst_ind++) {
    if (inst_cls[inst_ind] == occind2detind[occ_pred_label]) {
      const int dx = x - inst_xyz[inst_ind * 3 + 0];
      const int dy = y - inst_xyz[inst_ind * 3 + 1];
      const int dz = z - inst_xyz[inst_ind * 3 + 2];
      const int dist = dx * dx + dy * dy + dz * dz;
      if (dist < dist_min) {
        dist_min = dist;
        best_id = inst_id_list[inst_ind];
        found = true;
      }
    }
  }

  // Single store of the winner instead of one store per improvement.
  if (found) {
    inst_pred[idx] = best_id;
  }
}
// Host launcher: starts nearest_assign_kernel with 256 threads per block and
// enough blocks to give each voxel of the 200 x 200 x 16 grid one thread.
void nearest_assign(
    const int* l2s_key,
    int l2s_size,
    const int *__restrict__ occind2detind,
    int inst_size,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int* __restrict__ inst_pred) {
  const int threads_per_block = 256;
  const int num_voxels = 200 * 200 * 16;
  // Integer ceil-division; evaluates to 2500 blocks.
  const int num_blocks =
      (num_voxels + threads_per_block - 1) / threads_per_block;

  hipLaunchKernelGGL((nearest_assign_kernel),
                     dim3(num_blocks), dim3(threads_per_block), 0, 0,
                     l2s_key, l2s_size, occind2detind,
                     occ_pred, inst_xyz, inst_cls,
                     inst_id_list, inst_size, inst_pred);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <torch/torch.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
// CUDA function declarations
// Host-side launcher declared here and called by nearest_assign_forward()
// below with raw int pointers taken from the tensors.
//   l2s_key / l2s_size : label array and its length
//   occind2detind      : lookup table indexed by an occupancy label
//   inst_size          : number of entries in the three inst_* arrays
//   occ_pred           : input labels
//   inst_xyz           : 3 ints per instance
//   inst_cls           : one int per instance
//   inst_id_list       : one int per instance
//   inst_pred          : output buffer
void nearest_assign(
    const int* l2s_key,
    int l2s_size,
    const int *__restrict__ occind2detind,
    int inst_size,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int* __restrict__ inst_pred);
// Python-facing entry point: validates the tensors, then passes their raw
// int32 pointers to the nearest_assign() launcher.
//
// data_ptr<int>() already rejects tensors that are not int32. The checks
// below reject only inputs that would otherwise make the kernel read or
// write out of bounds (host memory, strided storage, short buffers).
void nearest_assign_forward(
    const at::Tensor _occ_pred,       // (200, 200, 16)
    const at::Tensor _l2s_key,        // (l2s_size, 1)
    const at::Tensor _occind2detind,  // (10, 1)
    const at::Tensor _inst_cls,       // (inst_size, 1)
    const at::Tensor _inst_xyz,       // (inst_size, 3)
    const at::Tensor _inst_id_list,   // (inst_size, 1)
    at::Tensor _inst_pred             // (200, 200, 16)
) {
  // The launcher always starts one thread per voxel of a 200 x 200 x 16 grid.
  constexpr int64_t kNumVoxels = 200 * 200 * 16;

  TORCH_CHECK(_occ_pred.is_cuda() && _l2s_key.is_cuda() &&
                  _occind2detind.is_cuda() && _inst_cls.is_cuda() &&
                  _inst_xyz.is_cuda() && _inst_id_list.is_cuda() &&
                  _inst_pred.is_cuda(),
              "nearest_assign_forward: all tensors must be device tensors");
  TORCH_CHECK(_occ_pred.is_contiguous() && _l2s_key.is_contiguous() &&
                  _occind2detind.is_contiguous() &&
                  _inst_cls.is_contiguous() && _inst_xyz.is_contiguous() &&
                  _inst_id_list.is_contiguous() &&
                  _inst_pred.is_contiguous(),
              "nearest_assign_forward: all tensors must be contiguous");
  TORCH_CHECK(_occ_pred.numel() >= kNumVoxels &&
                  _inst_pred.numel() >= kNumVoxels,
              "nearest_assign_forward: occ_pred and inst_pred must hold at "
              "least 200*200*16 elements, got ",
              _occ_pred.numel(), " and ", _inst_pred.numel());

  const int64_t num_l2s = _l2s_key.size(0);
  const int64_t num_inst = _inst_xyz.size(0);
  // The kernel reads 3 ints per instance from inst_xyz and one int per
  // instance from inst_cls / inst_id_list.
  TORCH_CHECK(_inst_xyz.numel() >= 3 * num_inst &&
                  _inst_cls.numel() >= num_inst &&
                  _inst_id_list.numel() >= num_inst,
              "nearest_assign_forward: inst_xyz, inst_cls and inst_id_list "
              "are too small for ", num_inst, " instances");

  const int l2s_size = static_cast<int>(num_l2s);
  const int inst_size = static_cast<int>(num_inst);

  // Launch on the device that owns occ_pred.
  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_occ_pred));

  const int* occ_pred = _occ_pred.data_ptr<int>();
  const int* inst_xyz = _inst_xyz.data_ptr<int>();
  const int* inst_cls = _inst_cls.data_ptr<int>();
  const int* l2s_key = _l2s_key.data_ptr<int>();
  const int* inst_id_list = _inst_id_list.data_ptr<int>();
  const int* occind2detind = _occind2detind.data_ptr<int>();
  int* inst_pred = _inst_pred.data_ptr<int>();

  nearest_assign(
      l2s_key,
      l2s_size,
      occind2detind,
      inst_size,
      occ_pred,
      inst_xyz,
      inst_cls,
      inst_id_list,
      inst_pred);
}
// Python binding: exposes nearest_assign_forward under its own name, with
// the same string used as the docstring.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  const char* forward_doc = "nearest_assign_forward";
  m.def("nearest_assign_forward", &nearest_assign_forward, forward_doc);
}
from setuptools import find_packages, setup
import os
import shutil
import sys
import torch
import warnings
from os import path as osp
from torch.utils.cpp_extension import (BuildExtension, CppExtension,
CUDAExtension)
def make_cuda_ext(name,
                  module,
                  sources,
                  sources_cuda=None,
                  extra_args=None,
                  extra_include_path=None):
    """Build a ``CUDAExtension`` or ``CppExtension`` description.

    A CUDA extension is produced when ``torch.cuda.is_available()`` is true
    or the environment variable ``FORCE_CUDA`` is ``'1'``; otherwise a plain
    C++ extension is produced and ``sources_cuda`` is ignored.

    Args:
        name: Extension name, appended to ``module`` with a dot.
        module: Dotted package path; also used as the directory prefix for
            every source path.
        sources: Source files relative to the module directory.
        sources_cuda: Extra sources added only for CUDA builds.
        extra_args: Extra flags passed to both the C++ and nvcc compilers.
        extra_include_path: Extra include directories.

    Returns:
        The extension object to pass to ``setup(ext_modules=...)``.

    None of the list arguments is modified; each one is copied first so
    neither the caller's lists nor shared defaults can be mutated.
    """
    all_sources = list(sources)
    cuda_sources = list(sources_cuda) if sources_cuda is not None else []
    compile_flags = list(extra_args) if extra_args is not None else []
    include_dirs = (list(extra_include_path)
                    if extra_include_path is not None else [])

    define_macros = []
    extra_compile_args = {'cxx': list(compile_flags)}

    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
        define_macros += [('WITH_CUDA', None)]
        extension = CUDAExtension
        extra_compile_args['nvcc'] = compile_flags + [
            '-D__CUDA_NO_HALF_OPERATORS__',
            '-D__CUDA_NO_HALF_CONVERSIONS__',
            '-D__CUDA_NO_HALF2_OPERATORS__',
        ]
        all_sources += cuda_sources
    else:
        print('Compiling {} without CUDA'.format(name))
        extension = CppExtension
        # raise EnvironmentError('CUDA is required to compile MMDetection!')

    return extension(
        name='{}.{}'.format(module, name),
        sources=[os.path.join(*module.split('.'), p) for p in all_sources],
        include_dirs=include_dirs,
        define_macros=define_macros,
        extra_compile_args=extra_compile_args)
if __name__ == '__main__':
    # Package metadata plus the three compiled ops (bev_pool, bev_pool_v2,
    # nearest_assign) built through make_cuda_ext.
    setup(
        name='flashocc_plugin',
        # The first fragment ends with a space: adjacent string literals are
        # concatenated as-is, which used to produce "platformfor".
        description=("OpenMMLab's next-generation platform "
                     'for general 3D object detection.'),
        long_description_content_type='text/markdown',
        author='MMDetection3D Contributors',
        author_email='zwwdev@gmail.com',
        keywords='computer vision, 3D object detection',
        url='https://github.com/open-mmlab/mmdetection3d',
        classifiers=[
            'Development Status :: 4 - Beta',
            'License :: OSI Approved :: Apache Software License',
            'Operating System :: OS Independent',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
        ],
        license='Apache License 2.0',
        ext_modules=[
            make_cuda_ext(
                name="bev_pool_ext",
                module="mmdet3d_plugin.ops.bev_pool",
                sources=[
                    "src/bev_pooling.cpp",
                    "src/bev_sum_pool.cpp",
                    "src/bev_sum_pool_cuda.cu",
                    "src/bev_max_pool.cpp",
                    "src/bev_max_pool_cuda.cu",
                ],
            ),
            make_cuda_ext(
                name="bev_pool_v2_ext",
                module="mmdet3d_plugin.ops.bev_pool_v2",
                sources=[
                    "src/bev_pool.cpp",
                    "src/bev_pool_cuda.cu"
                ],
            ),
            make_cuda_ext(
                name="nearest_assign_ext",
                module="mmdet3d_plugin.ops.nearest_assign",
                sources=[
                    "src/nearest_assign.cpp",
                    "src/nearest_assign_cuda.cu"
                ],
            ),
        ],
        cmdclass={'build_ext': BuildExtension},
        zip_safe=False)
import torch
from torch import tensor, device
import torch.fx as fx
from torch._dynamo.testing import rand_strided
from math import inf
import torch._inductor.inductor_prims
import torch._dynamo.config
import torch._inductor.config
import torch._functorch.config
import torch.fx.experimental._config
# Dynamo configuration value set before the repro module is built.
torch._dynamo.config.capture_scalar_outputs = True
# Assigned None here; nothing in the visible part of this file reads it.
isolate_fails_code_str = None
# torch version: 2.4.1
# torch cuda version: None
# torch git version: 45d303c9e4f41ec2f5450b6f60031246f67189d6
# CUDA Info:
# nvcc not found
# GPU Hardware Info:
# BW200 : 8
from torch.nn import *
class Repro(torch.nn.Module):
    # Stateless module whose forward() is a flat sequence of torch.ops.aten
    # calls. NOTE(review): the primals_* / tangents_1 argument names and the
    # convolution_backward calls suggest this is a traced backward graph;
    # the file header and run_repro() call indicate it was emitted by the
    # torch minifier -- treat it as generated code and do not hand-edit.
    def __init__(self):
        super().__init__()
    def forward(self, primals_1, primals_2, primals_4, primals_5, primals_6, primals_7, primals_8, primals_10, convert_element_type_1, clamp_max, convert_element_type_3, clamp_max_1, clamp_max_2, clamp_max_3, cat, convolution, squeeze_1, relu, convolution_1, getitem_3, rsqrt_1, convert_element_type_5, clamp_max_4, convert_element_type_7, clamp_max_5, clamp_max_6, clamp_max_7, add_19, convolution_2, squeeze_7, relu_2, unsqueeze_14, unsqueeze_38, tangents_1):
        # Intermediate names are rebound to None right after their last use.
        # Statement order matters throughout this method.

        # --- Stage 1: backward through the convolution using primals_10 ---
        sum_1 = torch.ops.aten.sum.dim_IntList(tangents_1, [0, 2, 3])
        convolution_backward = torch.ops.aten.convolution_backward.default(tangents_1, relu_2, primals_10, [256], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]);  tangents_1 = primals_10 = None
        getitem_6 = convolution_backward[0]
        getitem_7 = convolution_backward[1];  convolution_backward = None
        # Zero the incoming gradient wherever relu_2 <= 0.
        le = torch.ops.aten.le.Scalar(relu_2, 0);  relu_2 = None
        full_default = torch.ops.aten.full.default([], 0.0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
        where = torch.ops.aten.where.self(le, full_default, getitem_6);  le = getitem_6 = None
        # Per-channel reductions over dims [0, 2, 3].
        # NOTE(review): 6.25e-06 equals 1 / (4 * 200 * 200), matching the
        # (4, 256, 200, 200) tensors declared in load_args -- presumably a
        # normalisation backward; confirm.
        sum_2 = torch.ops.aten.sum.dim_IntList(where, [0, 2, 3])
        sub_13 = torch.ops.aten.sub.Tensor(convolution_2, unsqueeze_14);  convolution_2 = unsqueeze_14 = None
        mul_31 = torch.ops.aten.mul.Tensor(where, sub_13)
        sum_3 = torch.ops.aten.sum.dim_IntList(mul_31, [0, 2, 3]);  mul_31 = None
        mul_32 = torch.ops.aten.mul.Tensor(sum_2, 6.25e-06)
        unsqueeze_15 = torch.ops.aten.unsqueeze.default(mul_32, 0);  mul_32 = None
        unsqueeze_16 = torch.ops.aten.unsqueeze.default(unsqueeze_15, 2);  unsqueeze_15 = None
        unsqueeze_17 = torch.ops.aten.unsqueeze.default(unsqueeze_16, 3);  unsqueeze_16 = None
        mul_33 = torch.ops.aten.mul.Tensor(sum_3, 6.25e-06)
        mul_34 = torch.ops.aten.mul.Tensor(squeeze_7, squeeze_7)
        mul_35 = torch.ops.aten.mul.Tensor(mul_33, mul_34);  mul_33 = mul_34 = None
        unsqueeze_18 = torch.ops.aten.unsqueeze.default(mul_35, 0);  mul_35 = None
        unsqueeze_19 = torch.ops.aten.unsqueeze.default(unsqueeze_18, 2);  unsqueeze_18 = None
        unsqueeze_20 = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3);  unsqueeze_19 = None
        mul_36 = torch.ops.aten.mul.Tensor(squeeze_7, primals_8);  primals_8 = None
        unsqueeze_21 = torch.ops.aten.unsqueeze.default(mul_36, 0);  mul_36 = None
        unsqueeze_22 = torch.ops.aten.unsqueeze.default(unsqueeze_21, 2);  unsqueeze_21 = None
        unsqueeze_23 = torch.ops.aten.unsqueeze.default(unsqueeze_22, 3);  unsqueeze_22 = None
        mul_37 = torch.ops.aten.mul.Tensor(sub_13, unsqueeze_20);  sub_13 = unsqueeze_20 = None
        sub_15 = torch.ops.aten.sub.Tensor(where, mul_37);  where = mul_37 = None
        sub_16 = torch.ops.aten.sub.Tensor(sub_15, unsqueeze_17);  sub_15 = unsqueeze_17 = None
        mul_38 = torch.ops.aten.mul.Tensor(sub_16, unsqueeze_23);  sub_16 = unsqueeze_23 = None
        mul_39 = torch.ops.aten.mul.Tensor(sum_3, squeeze_7);  sum_3 = squeeze_7 = None

        # --- Stage 2: backward through the convolution using primals_7 ---
        convolution_backward_1 = torch.ops.aten.convolution_backward.default(mul_38, add_19, primals_7, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]);  mul_38 = add_19 = primals_7 = None
        getitem_9 = convolution_backward_1[0]
        getitem_10 = convolution_backward_1[1];  convolution_backward_1 = None
        # Split the gradient into four weighted parts and scatter-accumulate
        # them into a (4, 512, 100, 100) zero tensor at four index pairs.
        # NOTE(review): looks like the backward of a 2x interpolation --
        # confirm against the forward graph.
        mul_40 = torch.ops.aten.mul.Tensor(getitem_9, clamp_max_7);  clamp_max_7 = None
        neg = torch.ops.aten.neg.default(mul_40)
        add_25 = torch.ops.aten.add.Tensor(getitem_9, neg);  getitem_9 = neg = None
        mul_41 = torch.ops.aten.mul.Tensor(mul_40, clamp_max_6)
        neg_1 = torch.ops.aten.neg.default(mul_41)
        add_26 = torch.ops.aten.add.Tensor(mul_40, neg_1);  mul_40 = neg_1 = None
        mul_42 = torch.ops.aten.mul.Tensor(add_25, clamp_max_6);  clamp_max_6 = None
        neg_2 = torch.ops.aten.neg.default(mul_42)
        add_27 = torch.ops.aten.add.Tensor(add_25, neg_2);  add_25 = neg_2 = None
        full_default_1 = torch.ops.aten.full.default([4, 512, 100, 100], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
        _unsafe_index_put = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, clamp_max_5], mul_41, True);  mul_41 = None
        _unsafe_index_put_1 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, convert_element_type_7], add_26, True);  clamp_max_4 = add_26 = None
        add_28 = torch.ops.aten.add.Tensor(_unsafe_index_put, _unsafe_index_put_1);  _unsafe_index_put = _unsafe_index_put_1 = None
        _unsafe_index_put_2 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, clamp_max_5], mul_42, True);  clamp_max_5 = mul_42 = None
        add_29 = torch.ops.aten.add.Tensor(add_28, _unsafe_index_put_2);  add_28 = _unsafe_index_put_2 = None
        _unsafe_index_put_3 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, convert_element_type_7], add_27, True);  full_default_1 = convert_element_type_5 = convert_element_type_7 = add_27 = None
        add_30 = torch.ops.aten.add.Tensor(add_29, _unsafe_index_put_3);  add_29 = _unsafe_index_put_3 = None
        # Recompute relu_1 from convolution_1, getitem_3, rsqrt_1, primals_5
        # and primals_6, only to build the mask relu_1 <= 0.
        sub_6 = torch.ops.aten.sub.Tensor(convolution_1, getitem_3)
        mul_12 = torch.ops.aten.mul.Tensor(sub_6, rsqrt_1);  sub_6 = None
        unsqueeze_4 = torch.ops.aten.unsqueeze.default(primals_5, -1)
        unsqueeze_5 = torch.ops.aten.unsqueeze.default(unsqueeze_4, -1);  unsqueeze_4 = None
        mul_18 = torch.ops.aten.mul.Tensor(mul_12, unsqueeze_5);  mul_12 = unsqueeze_5 = None
        unsqueeze_6 = torch.ops.aten.unsqueeze.default(primals_6, -1);  primals_6 = None
        unsqueeze_7 = torch.ops.aten.unsqueeze.default(unsqueeze_6, -1);  unsqueeze_6 = None
        add_14 = torch.ops.aten.add.Tensor(mul_18, unsqueeze_7);  mul_18 = unsqueeze_7 = None
        relu_1 = torch.ops.aten.relu.default(add_14);  add_14 = None
        le_1 = torch.ops.aten.le.Scalar(relu_1, 0);  relu_1 = None
        where_1 = torch.ops.aten.where.self(le_1, full_default, add_30);  le_1 = add_30 = None
        squeeze_3 = torch.ops.aten.squeeze.dims(getitem_3, [0, 2, 3]);  getitem_3 = None
        unsqueeze_24 = torch.ops.aten.unsqueeze.default(squeeze_3, 0);  squeeze_3 = None
        unsqueeze_25 = torch.ops.aten.unsqueeze.default(unsqueeze_24, 2);  unsqueeze_24 = None
        unsqueeze_26 = torch.ops.aten.unsqueeze.default(unsqueeze_25, 3);  unsqueeze_25 = None
        # NOTE(review): 2.5e-05 equals 1 / (4 * 100 * 100), matching the
        # (4, 512, 100, 100) tensors declared in load_args -- confirm.
        sum_4 = torch.ops.aten.sum.dim_IntList(where_1, [0, 2, 3])
        sub_17 = torch.ops.aten.sub.Tensor(convolution_1, unsqueeze_26);  convolution_1 = unsqueeze_26 = None
        mul_43 = torch.ops.aten.mul.Tensor(where_1, sub_17)
        sum_5 = torch.ops.aten.sum.dim_IntList(mul_43, [0, 2, 3]);  mul_43 = None
        mul_44 = torch.ops.aten.mul.Tensor(sum_4, 2.5e-05)
        unsqueeze_27 = torch.ops.aten.unsqueeze.default(mul_44, 0);  mul_44 = None
        unsqueeze_28 = torch.ops.aten.unsqueeze.default(unsqueeze_27, 2);  unsqueeze_27 = None
        unsqueeze_29 = torch.ops.aten.unsqueeze.default(unsqueeze_28, 3);  unsqueeze_28 = None
        mul_45 = torch.ops.aten.mul.Tensor(sum_5, 2.5e-05)
        squeeze_4 = torch.ops.aten.squeeze.dims(rsqrt_1, [0, 2, 3]);  rsqrt_1 = None
        mul_46 = torch.ops.aten.mul.Tensor(squeeze_4, squeeze_4)
        mul_47 = torch.ops.aten.mul.Tensor(mul_45, mul_46);  mul_45 = mul_46 = None
        unsqueeze_30 = torch.ops.aten.unsqueeze.default(mul_47, 0);  mul_47 = None
        unsqueeze_31 = torch.ops.aten.unsqueeze.default(unsqueeze_30, 2);  unsqueeze_30 = None
        unsqueeze_32 = torch.ops.aten.unsqueeze.default(unsqueeze_31, 3);  unsqueeze_31 = None
        mul_48 = torch.ops.aten.mul.Tensor(squeeze_4, primals_5);  primals_5 = None
        unsqueeze_33 = torch.ops.aten.unsqueeze.default(mul_48, 0);  mul_48 = None
        unsqueeze_34 = torch.ops.aten.unsqueeze.default(unsqueeze_33, 2);  unsqueeze_33 = None
        unsqueeze_35 = torch.ops.aten.unsqueeze.default(unsqueeze_34, 3);  unsqueeze_34 = None
        mul_49 = torch.ops.aten.mul.Tensor(sub_17, unsqueeze_32);  sub_17 = unsqueeze_32 = None
        sub_19 = torch.ops.aten.sub.Tensor(where_1, mul_49);  where_1 = mul_49 = None
        sub_20 = torch.ops.aten.sub.Tensor(sub_19, unsqueeze_29);  sub_19 = unsqueeze_29 = None
        mul_50 = torch.ops.aten.mul.Tensor(sub_20, unsqueeze_35);  sub_20 = unsqueeze_35 = None
        mul_51 = torch.ops.aten.mul.Tensor(sum_5, squeeze_4);  sum_5 = squeeze_4 = None

        # --- Stage 3: backward through the convolution using primals_4 ---
        convolution_backward_2 = torch.ops.aten.convolution_backward.default(mul_50, relu, primals_4, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]);  mul_50 = primals_4 = None
        getitem_12 = convolution_backward_2[0]
        getitem_13 = convolution_backward_2[1];  convolution_backward_2 = None
        le_2 = torch.ops.aten.le.Scalar(relu, 0);  relu = None
        where_2 = torch.ops.aten.where.self(le_2, full_default, getitem_12);  le_2 = full_default = getitem_12 = None
        sum_6 = torch.ops.aten.sum.dim_IntList(where_2, [0, 2, 3])
        sub_21 = torch.ops.aten.sub.Tensor(convolution, unsqueeze_38);  convolution = unsqueeze_38 = None
        mul_52 = torch.ops.aten.mul.Tensor(where_2, sub_21)
        sum_7 = torch.ops.aten.sum.dim_IntList(mul_52, [0, 2, 3]);  mul_52 = None
        mul_53 = torch.ops.aten.mul.Tensor(sum_6, 2.5e-05)
        unsqueeze_39 = torch.ops.aten.unsqueeze.default(mul_53, 0);  mul_53 = None
        unsqueeze_40 = torch.ops.aten.unsqueeze.default(unsqueeze_39, 2);  unsqueeze_39 = None
        unsqueeze_41 = torch.ops.aten.unsqueeze.default(unsqueeze_40, 3);  unsqueeze_40 = None
        mul_54 = torch.ops.aten.mul.Tensor(sum_7, 2.5e-05)
        mul_55 = torch.ops.aten.mul.Tensor(squeeze_1, squeeze_1)
        mul_56 = torch.ops.aten.mul.Tensor(mul_54, mul_55);  mul_54 = mul_55 = None
        unsqueeze_42 = torch.ops.aten.unsqueeze.default(mul_56, 0);  mul_56 = None
        unsqueeze_43 = torch.ops.aten.unsqueeze.default(unsqueeze_42, 2);  unsqueeze_42 = None
        unsqueeze_44 = torch.ops.aten.unsqueeze.default(unsqueeze_43, 3);  unsqueeze_43 = None
        mul_57 = torch.ops.aten.mul.Tensor(squeeze_1, primals_2);  primals_2 = None
        unsqueeze_45 = torch.ops.aten.unsqueeze.default(mul_57, 0);  mul_57 = None
        unsqueeze_46 = torch.ops.aten.unsqueeze.default(unsqueeze_45, 2);  unsqueeze_45 = None
        unsqueeze_47 = torch.ops.aten.unsqueeze.default(unsqueeze_46, 3);  unsqueeze_46 = None
        mul_58 = torch.ops.aten.mul.Tensor(sub_21, unsqueeze_44);  sub_21 = unsqueeze_44 = None
        sub_23 = torch.ops.aten.sub.Tensor(where_2, mul_58);  where_2 = mul_58 = None
        sub_24 = torch.ops.aten.sub.Tensor(sub_23, unsqueeze_41);  sub_23 = unsqueeze_41 = None
        mul_59 = torch.ops.aten.mul.Tensor(sub_24, unsqueeze_47);  sub_24 = unsqueeze_47 = None
        mul_60 = torch.ops.aten.mul.Tensor(sum_7, squeeze_1);  sum_7 = squeeze_1 = None

        # --- Stage 4: backward through the convolution using primals_1 ---
        convolution_backward_3 = torch.ops.aten.convolution_backward.default(mul_59, cat, primals_1, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]);  mul_59 = cat = primals_1 = None
        getitem_15 = convolution_backward_3[0]
        getitem_16 = convolution_backward_3[1];  convolution_backward_3 = None
        # Split the input gradient along dim 1 into channels [0, 128) and
        # [128, 640).
        slice_1 = torch.ops.aten.slice.Tensor(getitem_15, 1, 0, 128)
        slice_2 = torch.ops.aten.slice.Tensor(getitem_15, 1, 128, 640);  getitem_15 = None
        # Same four-way weighted scatter as above, now into a
        # (4, 512, 25, 25) zero tensor.
        mul_61 = torch.ops.aten.mul.Tensor(slice_2, clamp_max_3);  clamp_max_3 = None
        neg_3 = torch.ops.aten.neg.default(mul_61)
        add_31 = torch.ops.aten.add.Tensor(slice_2, neg_3);  slice_2 = neg_3 = None
        mul_62 = torch.ops.aten.mul.Tensor(mul_61, clamp_max_2)
        neg_4 = torch.ops.aten.neg.default(mul_62)
        add_32 = torch.ops.aten.add.Tensor(mul_61, neg_4);  mul_61 = neg_4 = None
        mul_63 = torch.ops.aten.mul.Tensor(add_31, clamp_max_2);  clamp_max_2 = None
        neg_5 = torch.ops.aten.neg.default(mul_63)
        add_33 = torch.ops.aten.add.Tensor(add_31, neg_5);  add_31 = neg_5 = None
        full_default_7 = torch.ops.aten.full.default([4, 512, 25, 25], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
        _unsafe_index_put_4 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, clamp_max_1], mul_62, True);  mul_62 = None
        _unsafe_index_put_5 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, convert_element_type_3], add_32, True);  clamp_max = add_32 = None
        add_34 = torch.ops.aten.add.Tensor(_unsafe_index_put_4, _unsafe_index_put_5);  _unsafe_index_put_4 = _unsafe_index_put_5 = None
        _unsafe_index_put_6 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, clamp_max_1], mul_63, True);  clamp_max_1 = mul_63 = None
        add_35 = torch.ops.aten.add.Tensor(add_34, _unsafe_index_put_6);  add_34 = _unsafe_index_put_6 = None
        _unsafe_index_put_7 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, convert_element_type_3], add_33, True);  full_default_7 = convert_element_type_1 = convert_element_type_3 = add_33 = None
        add_36 = torch.ops.aten.add.Tensor(add_35, _unsafe_index_put_7);  add_35 = _unsafe_index_put_7 = None
        # 22 outputs; the nine None entries are fixed placeholders.
        return [getitem_16, mul_60, sum_6, getitem_13, mul_51, sum_4, getitem_10, mul_39, sum_2, getitem_7, sum_1, None, None, None, None, None, None, None, None, None, slice_1, add_36]
def load_args(reader):
    # Declares the 34 inputs of Repro.forward, in argument order, by
    # registering one storage and one tensor per input on ``reader``.
    # Every storage is placed on device cuda:2. The trailing comment on each
    # reader.tensor line names the forward() argument it corresponds to.
    # NOTE(review): the first storage argument looks like a content hash and
    # the second like a size in bytes (e.g. 512*640*3*3*4 == 11796480) --
    # confirm against the reader implementation.
    buf0 = reader.storage('934c55e4a7a69a0a29a96cd8ef9f11c9859658e1', 11796480, device=device(type='cuda', index=2))
    reader.tensor(buf0, (512, 640, 3, 3), requires_grad=True, is_leaf=True)  # primals_1
    buf1 = reader.storage('f12094f433480ec90280d223057708434df38941', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf1, (512,), requires_grad=True, is_leaf=True)  # primals_2
    buf2 = reader.storage('06c46ad2c91ec5c8eebc4fb0be80459bdfe007a8', 9437184, device=device(type='cuda', index=2))
    reader.tensor(buf2, (512, 512, 3, 3), requires_grad=True, is_leaf=True)  # primals_4
    buf3 = reader.storage('aba0c4266c842d1845e720dc0c789942770a60b7', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf3, (512,), requires_grad=True, is_leaf=True)  # primals_5
    buf4 = reader.storage('bb8471d379e03c8ccb9897ce7d3a2dfbacb44e30', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf4, (512,), requires_grad=True, is_leaf=True)  # primals_6
    buf5 = reader.storage('b9484105fb5b2045fb6550a1edb77af72e639416', 4718592, device=device(type='cuda', index=2))
    reader.tensor(buf5, (256, 512, 3, 3), requires_grad=True, is_leaf=True)  # primals_7
    buf6 = reader.storage('b778b8cab416c3fa6763b88e431266ae6ea28941', 1024, device=device(type='cuda', index=2))
    reader.tensor(buf6, (256,), requires_grad=True, is_leaf=True)  # primals_8
    buf7 = reader.storage('c5f14ec72c73a593b47ef4aecf37f6bb25d2dec4', 262144, device=device(type='cuda', index=2))
    reader.tensor(buf7, (256, 256, 1, 1), requires_grad=True, is_leaf=True)  # primals_10
    # Integer index tensors (int64) and float weights follow.
    buf8 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf8, (100, 1), dtype=torch.int64, is_leaf=True)  # convert_element_type_1
    buf9 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf9, (100, 1), dtype=torch.int64, is_leaf=True)  # clamp_max
    buf10 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf10, (100,), dtype=torch.int64, is_leaf=True)  # convert_element_type_3
    buf11 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf11, (100,), dtype=torch.int64, is_leaf=True)  # clamp_max_1
    buf12 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
    reader.tensor(buf12, (100,), is_leaf=True)  # clamp_max_2
    buf13 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
    reader.tensor(buf13, (100, 1), is_leaf=True)  # clamp_max_3
    # Saved activations and statistics.
    buf14 = reader.storage('5d41e66671a283b70001fd74345d8e7e3def00bd', 102400000, device=device(type='cuda', index=2))
    reader.tensor(buf14, (4, 640, 100, 100), is_leaf=True)  # cat
    buf15 = reader.storage('a8fe0ed584571bb3218d663656459a36545be5e6', 81920000, device=device(type='cuda', index=2))
    reader.tensor(buf15, (4, 512, 100, 100), is_leaf=True)  # convolution
    buf16 = reader.storage('0af13bcf109b8ca2df7f5ce3387d51e8576fb30a', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf16, (512,), is_leaf=True)  # squeeze_1
    buf17 = reader.storage('32f14d6fa07f654fbb09ef1563066303a3501eda', 81920000, device=device(type='cuda', index=2))
    reader.tensor(buf17, (4, 512, 100, 100), is_leaf=True)  # relu
    buf18 = reader.storage('aca23d51e723ad9b4bec2e54d6f0af4b5b85cc7d', 81920000, device=device(type='cuda', index=2))
    reader.tensor(buf18, (4, 512, 100, 100), is_leaf=True)  # convolution_1
    buf19 = reader.storage('4940c79e48676c2e1359870dc770e25cd780983d', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf19, (1, 512, 1, 1), is_leaf=True)  # getitem_3
    buf20 = reader.storage('d17407a9f45954a4d0d36e5b20a40ac554cc3aff', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf20, (1, 512, 1, 1), is_leaf=True)  # rsqrt_1
    buf21 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf21, (200, 1), dtype=torch.int64, is_leaf=True)  # convert_element_type_5
    buf22 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf22, (200, 1), dtype=torch.int64, is_leaf=True)  # clamp_max_4
    buf23 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf23, (200,), dtype=torch.int64, is_leaf=True)  # convert_element_type_7
    buf24 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
    reader.tensor(buf24, (200,), dtype=torch.int64, is_leaf=True)  # clamp_max_5
    buf25 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
    reader.tensor(buf25, (200,), is_leaf=True)  # clamp_max_6
    buf26 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
    reader.tensor(buf26, (200, 1), is_leaf=True)  # clamp_max_7
    buf27 = reader.storage('32194c54194bddd5f695a8d306828130629246fc', 327680000, device=device(type='cuda', index=2))
    reader.tensor(buf27, (4, 512, 200, 200), is_leaf=True)  # add_19
    buf28 = reader.storage('e3a286ef8d6373c83ef30afe16eaae96ee52b965', 163840000, device=device(type='cuda', index=2))
    reader.tensor(buf28, (4, 256, 200, 200), is_leaf=True)  # convolution_2
    buf29 = reader.storage('9572b289e6d5c9bdd20a79367d4005440da40795', 1024, device=device(type='cuda', index=2))
    reader.tensor(buf29, (256,), is_leaf=True)  # squeeze_7
    buf30 = reader.storage('42f9ce794a05b12a40f15cbd4abb1201ccef0f72', 163840000, device=device(type='cuda', index=2))
    reader.tensor(buf30, (4, 256, 200, 200), is_leaf=True)  # relu_2
    buf31 = reader.storage('61670207f087dc68f052bc03747d9ab365297b17', 1024, device=device(type='cuda', index=2))
    reader.tensor(buf31, (1, 256, 1, 1), is_leaf=True)  # unsqueeze_14
    buf32 = reader.storage('ab77896e6dd76345e63586ecda30b1e4a63439cc', 2048, device=device(type='cuda', index=2))
    reader.tensor(buf32, (1, 512, 1, 1), is_leaf=True)  # unsqueeze_38
    # Incoming gradient.
    buf33 = reader.storage('f0ec623d2a44ff0f64fc264faf9128c2a6896e57', 163840000, device=device(type='cuda', index=2))
    reader.tensor(buf33, (4, 256, 200, 200), is_leaf=True)  # tangents_1
# Version tag attached to the loader function itself.
load_args._version = 0
# Module instance handed to run_repro below.
mod = Repro()
if __name__ == '__main__':
    from torch._dynamo.repro.after_aot import run_repro
    # Gradients are disabled while the repro runs.
    # NOTE(review): save_dir is an absolute path under /root/FlashOCC from
    # the run that produced this file; adjust it before running elsewhere.
    with torch.no_grad():
        run_repro(mod, load_args, accuracy=True, command='run', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
        # To run it separately, do
        # mod, args = run_repro(mod, load_args, accuracy=True, command='get_args', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
        # mod(*args)
\ No newline at end of file
-r requirements/build.txt
-r requirements/optional.txt
-r requirements/runtime.txt
-r requirements/tests.txt
docutils==0.16.0
m2r
mistune==0.8.4
myst-parser
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==4.0.2
sphinx-copybutton
sphinx_markdown_tables
mmcv-full>=1.4.8,<=1.6.0
mmdet>=2.24.0,<=3.0.0
mmsegmentation>=0.20.0,<=1.0.0
open3d
spconv
waymo-open-dataset-tf-2-1-0==1.2.0
mmcv>=1.4.8
mmdet>=2.24.0
mmsegmentation>=0.20.1
torch
torchvision
lyft_dataset_sdk
networkx>=2.2,<2.3
numba==0.53.0
numpy
nuscenes-devkit
plyfile
scikit-image
# by default we also use tensorboard to log results
tensorboard
trimesh>=2.35.39,<2.35.40
asynctest
codecov
flake8
interrogate
isort
# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
kwarray
pytest
pytest-cov
pytest-runner
ubelt
xdoctest >= 0.10.0
yapf
118 ./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 2048 -k 25344 --alpha 1 --a_type f32_r --lda 25344 --b_type f32_r --ldb 25344 --beta 0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 960000 -k 512 --alpha 1 --lda 256 --ldb 512 --beta 0 --ldc 256
2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 960000 -k 288 --alpha 1 --lda 512 --ldb 288 --beta 0 --ldc 512
2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 960000 --alpha 1 --lda 256 --ldb 512 --beta 0 --ldc 256
2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 288 -k 960000 --alpha 1 --lda 512 --ldb 288 --beta 0 --ldc 512
2 ./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 288 -n 960000 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 1 --ldc 288
2 ./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 960000 -k 256 --alpha 1 --lda 256 --ldb 256 --beta 0 --ldc 512
948 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 11264 -n 256 -k 64 --alpha 1 --a_type f32_r --lda 11264 --stride_a 720896 --b_type f32_r --ldb 64 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 2883584 --d_type f32_r --ldd 11264 --stride_d 2883584 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
237 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 11264 -n 64 -k 64 --alpha 1 --a_type f32_r --lda 11264 --stride_a 720896 --b_type f32_r --ldb 64 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 720896 --d_type f32_r --ldd 11264 --stride_d 720896 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
948 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 2816 -n 512 -k 128 --alpha 1 --a_type f32_r --lda 2816 --stride_a 360448 --b_type f32_r --ldb 128 --stride_b 0 --beta 0 --c_type f32_r --ldc 2816 --stride_c 1441792 --d_type f32_r --ldd 2816 --stride_d 1441792 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
237 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 2816 -n 512 -k 256 --alpha 1 --a_type f32_r --lda 2816 --stride_a 720896 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 2816 --stride_c 1441792 --d_type f32_r --ldd 2816 --stride_d 1441792 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
119 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 40000 -n 256 -k 256 --alpha 1 --a_type f32_r --lda 40000 --stride_a 10240000 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 40000 --stride_c 10240000 --d_type f32_r --ldd 40000 --stride_d 10240000 --batch_count 24 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
119 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 704 -n 152 -k 256 --alpha 1 --a_type f32_r --lda 704 --stride_a 180224 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 704 --stride_c 107008 --d_type f32_r --ldd 704 --stride_d 107008 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 11264 -n 256 -k 128 --alpha 1 --a_type f32_r --lda 11264 --stride_a 1441792 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 2883584 --d_type f32_r --ldd 11264 --stride_d 2883584 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
236 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 11264 -n 256 -k 64 --alpha 1 --a_type f32_r --lda 11264 --stride_a 720896 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 2883584 --d_type f32_r --ldd 11264 --stride_d 2883584 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 2816 -n 256 -k 512 --alpha 1 --a_type f32_r --lda 2816 --stride_a 1441792 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 2816 --stride_c 720896 --d_type f32_r --ldd 2816 --stride_d 720896 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 704 -n 512 -k 1024 --alpha 1 --a_type f32_r --lda 704 --stride_a 720896 --b_type f32_r --ldb 512 --stride_b 0 --beta 0 --c_type f32_r --ldc 704 --stride_c 360448 --d_type f32_r --ldd 704 --stride_d 360448 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 512 -n 512 -k 9600 --alpha 1 --a_type f32_r --lda 9600 --stride_a 4915200 --b_type f32_r --ldb 9600 --stride_b 4915200 --beta 0 --c_type f32_r --ldc 512 --stride_c 262144 --d_type f32_r --ldd 512 --stride_d 262144 --batch_count 49 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 512 -n 640 -k 9600 --alpha 1 --a_type f32_r --lda 9600 --stride_a 4915200 --b_type f32_r --ldb 9600 --stride_b 6144000 --beta 0 --c_type f32_r --ldc 512 --stride_c 327680 --d_type f32_r --ldd 512 --stride_d 327680 --batch_count 49 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
2 ./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 3 -n 1 -k 3 --alpha 1 --lda 3 --stride_a 9 --ldb 3 --stride_b 3 --beta 0 --ldc 3 --stride_c 3 --batch_count 8921088
2 ./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 3 -n 3 -k 3 --alpha 1 --lda 3 --stride_a 9 --ldb 4 --stride_b 16 --beta 0 --ldc 3 --stride_c 9 --batch_count 144
4 ./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB T -m 3 -n 1 -k 3 --alpha 1 --lda 3 --stride_a 9 --ldb 1 --stride_b 3 --beta 0 --ldc 3 --stride_c 3 --batch_count 8921088
238 ./rocblas-bench -f gemm_strided_batched -r f64_r --transposeA N --transposeB N -m 4 -n 4 -k 4 --alpha 1 --lda 4 --stride_a 16 --ldb 4 --stride_b 16 --beta 0 --ldc 4 --stride_c 16 --batch_count 144
#!/bin/bash
# Launch tools/dist_train_numa.sh with a fixed set of NCCL / MIOpen /
# rocBLAS / TorchInductor environment variables and tee the combined
# stdout+stderr into a timestamped log file in the current directory.
#
# Positional arguments (all optional):
#   $1  MASTER_ADDR  (default: localhost)
#   $2  NNODES       (default: 1)
#   $3  NODE_RANK    (default: 0)
#   $4  CONFIG       (default: projects/configs/flashocc/flashocc-r50.py)

# NCCL settings: ring algorithm with explicitly listed rings.
export NCCL_TOPO_FILE=null
export NCCL_ALGO=Ring
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"

# MIOpen / rocBLAS / HSA / TorchInductor tuning switches.
export PYTORCH_MIOPEN_SUGGEST_NHWC=1
export MIOPEN_PRECISION_FP32_FP32_FP32_TF32_FP32=1
export MIOPEN_FIND_MODE=1
export ROCBLAS_MATH_MODE=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1

# Optional debug logging (uncomment to enable):
#export MIOPEN_ENABLE_LOGGING=1      # enable MIOpen logging, default = 0
#export MIOPEN_ENABLE_LOGGING_CMD=1  # log MIOpen command info, default = 0
#export MIOPEN_LOG_LEVEL=6           # MIOpen log verbosity level, default = 0
#export ROCBLAS_LAYER=3              # enable rocBLAS log output, default = 0

TIME=$(date "+%Y-%m-%d_%H_%M")
MASTER_ADDR=${1:-localhost}
NNODES=${2:-1}
NODE_RANK=${3:-0}
CONFIG=${4:-projects/configs/flashocc/flashocc-r50.py}

# Quote every expansion so arguments containing spaces or glob characters
# are passed through unchanged instead of being split or expanded.
bash tools/dist_train_numa.sh "$MASTER_ADDR" "$NNODES" "$NODE_RANK" "$CONFIG" \
    2>&1 | tee "cvm_bw1000_flashocc_${NNODES}nodes_${TIME}.log"
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
from collections import defaultdict
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
def cal_train_time(log_dicts, args):
    """Print per-log training-time statistics.

    For every log the slowest/fastest epoch (by mean iteration time), the
    standard deviation of the per-epoch means and the mean time over all
    iterations are printed.

    Args:
        log_dicts (list[dict]): One mapping per log file,
            ``epoch -> {metric name -> list of values}``; the ``'time'``
            list of every epoch is read.
        args: Namespace providing ``json_logs`` (names used in the banner)
            and ``include_outliers`` (when false, the first time sample of
            every epoch is dropped).
    """
    for i, log_dict in enumerate(log_dicts):
        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
        epoch_ids = []
        epoch_times = []
        for epoch, records in log_dict.items():
            times = records.get('time', [])
            if not args.include_outliers:
                times = times[1:]
            # An epoch without timing samples has no mean; skip it rather
            # than poisoning the statistics with NaN.
            if len(times) == 0:
                continue
            epoch_ids.append(epoch)
            epoch_times.append(np.asarray(times, dtype=float))
        if not epoch_times:
            print('no timing records found')
            print()
            continue
        # Epochs may contain different numbers of iterations (e.g. an
        # aborted last epoch), so average each epoch on its own instead of
        # stacking them into one rectangular array.
        epoch_ave_time = np.array([times.mean() for times in epoch_times])
        slowest = epoch_ave_time.argmax()
        fastest = epoch_ave_time.argmin()
        std_over_epoch = epoch_ave_time.std()
        # Report the epoch number stored in the log, which stays correct
        # when the log does not start at epoch 1 (resumed training).
        print(f'slowest epoch {epoch_ids[slowest]}, '
              f'average time is {epoch_ave_time[slowest]:.4f}')
        print(f'fastest epoch {epoch_ids[fastest]}, '
              f'average time is {epoch_ave_time[fastest]:.4f}')
        print(f'time std over epochs is {std_over_epoch:.4f}')
        mean_iter_time = np.concatenate(epoch_times).mean()
        print(f'average iter time: {mean_iter_time:.4f} s/iter')
        print()
def plot_curve(log_dicts, args):
    """Plot the requested metrics of every log onto the current figure.

    Args:
        log_dicts (list[dict]): One mapping per log file,
            ``epoch -> {key -> list of values}``.
        args: Namespace providing ``backend``, ``style``, ``legend``,
            ``json_logs``, ``keys``, ``mode``, ``interval``, ``title`` and
            ``out``.

    Raises:
        KeyError: If a requested metric is missing from the epoch at
            position ``interval - 1`` of a log.
        AssertionError: If the number of legend entries differs from
            ``len(json_logs) * len(keys)``.
    """
    if args.backend is not None:
        plt.switch_backend(args.backend)
    sns.set_style(args.style)
    # if legend is None, use {filename}_{key} as legend
    legend = args.legend
    if legend is None:
        legend = []
        for json_log in args.json_logs:
            for metric in args.keys:
                legend.append(f'{json_log}_{metric}')
    # one legend entry is needed for every (log, metric) pair
    assert len(legend) == (len(args.json_logs) * len(args.keys))
    metrics = args.keys
    num_metrics = len(metrics)
    for i, log_dict in enumerate(log_dicts):
        epochs = list(log_dict.keys())
        for j, metric in enumerate(metrics):
            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
            # the metric is looked up in the first epoch that gets sampled
            if metric not in log_dict[epochs[args.interval - 1]]:
                raise KeyError(
                    f'{args.json_logs[i]} does not contain metric {metric}')
            if args.mode == 'eval':
                # eval mode: x axis is the epoch number, one point per
                # `interval` epochs starting at x0
                if min(epochs) == args.interval:
                    x0 = args.interval
                else:
                    # if current training is resumed from previous checkpoint
                    # we lost information in early epochs
                    # `xs` should start according to `min(epochs)`
                    if min(epochs) % args.interval == 0:
                        x0 = min(epochs)
                    else:
                        # find the first epoch that do eval
                        x0 = min(epochs) + args.interval - \
                            min(epochs) % args.interval
                xs = np.arange(x0, max(epochs) + 1, args.interval)
                ys = []
                for epoch in epochs[args.interval - 1::args.interval]:
                    ys += log_dict[epoch][metric]
                # if training is aborted before eval of the last epoch
                # `xs` and `ys` will have different length and cause an error
                # check if `ys[-1]` is empty here
                # (`epoch` is the last value left over from the loop above)
                if not log_dict[epoch][metric]:
                    xs = xs[:-1]
                ax = plt.gca()
                ax.set_xticks(xs)
                plt.xlabel('epoch')
                plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
            else:
                # any other mode: x axis is a running iteration index
                xs = []
                ys = []
                # the last logged `iter` value of the first sampled epoch is
                # used as the iteration count of every epoch
                num_iters_per_epoch = \
                    log_dict[epochs[args.interval-1]]['iter'][-1]
                for epoch in epochs[args.interval - 1::args.interval]:
                    iters = log_dict[epoch]['iter']
                    # drop the trailing entry when the epoch's last record
                    # is a 'val' record
                    if log_dict[epoch]['mode'][-1] == 'val':
                        iters = iters[:-1]
                    # offset the in-epoch iteration by the preceding epochs
                    xs.append(
                        np.array(iters) + (epoch - 1) * num_iters_per_epoch)
                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
                xs = np.concatenate(xs)
                ys = np.concatenate(ys)
                plt.xlabel('iter')
                plt.plot(
                    xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
            plt.legend()
        if args.title is not None:
            plt.title(args.title)
    # show interactively unless an output path was given
    if args.out is None:
        plt.show()
    else:
        print(f'save curve to: {args.out}')
        plt.savefig(args.out)
        plt.cla()
def add_plot_parser(subparsers):
    """Register the ``plot_curve`` sub-command and its options.

    Args:
        subparsers: The object returned by
            ``ArgumentParser.add_subparsers`` to which the sub-command is
            added.
    """
    plot_parser = subparsers.add_parser(
        'plot_curve', help='parser for plotting curves')
    # (name or flag, keyword options), listed in registration order.
    option_specs = (
        ('json_logs', dict(
            type=str, nargs='+', help='path of train log in json format')),
        ('--keys', dict(
            type=str,
            nargs='+',
            default=['mAP_0.25'],
            help='the metric that you want to plot')),
        ('--title', dict(type=str, help='title of figure')),
        ('--legend', dict(
            type=str, nargs='+', default=None, help='legend of each plot')),
        ('--backend', dict(type=str, default=None, help='backend of plt')),
        ('--style', dict(type=str, default='dark', help='style of plt')),
        ('--out', dict(type=str, default=None)),
        ('--mode', dict(type=str, default='train')),
        ('--interval', dict(type=int, default=1)),
    )
    for name_or_flag, options in option_specs:
        plot_parser.add_argument(name_or_flag, **options)
def add_time_parser(subparsers):
    """Register the ``cal_train_time`` sub-command and its options.

    Args:
        subparsers: The object returned by
            ``ArgumentParser.add_subparsers`` to which the sub-command is
            added.
    """
    time_parser = subparsers.add_parser(
        'cal_train_time',
        help='parser for computing the average time per training iteration')
    # Positional: one or more log files.
    time_parser.add_argument(
        'json_logs',
        nargs='+',
        type=str,
        help='path of train log in json format')
    # Flag: keep the first time sample of each epoch in the statistics.
    outlier_help = ('include the first value of every epoch when computing '
                    'the average time')
    time_parser.add_argument(
        '--include-outliers', action='store_true', help=outlier_help)
def parse_args():
    """Parse the command line of the log-analysis tool.

    Returns:
        argparse.Namespace: Parsed arguments; ``task`` holds the selected
        sub-command name (``plot_curve`` or ``cal_train_time``).

    Exits with an argparse usage error when no sub-command is given.
    """
    parser = argparse.ArgumentParser(description='Analyze Json Log')
    # currently only support plot curve and calculate average train time
    subparsers = parser.add_subparsers(dest='task', help='task parser')
    add_plot_parser(subparsers)
    add_time_parser(subparsers)
    args = parser.parse_args()
    # Sub-commands are optional by default in Python 3; without this check
    # a bare invocation yields `task=None` and fails later with an
    # unrelated AttributeError instead of a usage message.
    if args.task is None:
        parser.error('a task is required: plot_curve or cal_train_time')
    return args
def load_json_logs(json_logs):
    """Load line-delimited JSON training logs.

    Each non-blank line of a log file is parsed as one JSON object. Lines
    without an ``epoch`` field are ignored; for the others every remaining
    field is appended to a per-epoch list.

    Args:
        json_logs (list[str]): Paths of the log files.

    Returns:
        list[dict]: One dict per input file, mapping
        ``epoch -> defaultdict(list)`` whose keys are the logged fields
        (e.g. memory, bbox_mAP) and whose values are the lists of logged
        values in file order.
    """
    log_dicts = []
    for json_log in json_logs:
        log_dict = {}
        with open(json_log, 'r') as log_file:
            for line in log_file:
                line = line.strip()
                # tolerate blank lines instead of failing in json.loads
                if not line:
                    continue
                log = json.loads(line)
                # skip lines without `epoch` field
                if 'epoch' not in log:
                    continue
                epoch = log.pop('epoch')
                if epoch not in log_dict:
                    log_dict[epoch] = defaultdict(list)
                per_epoch = log_dict[epoch]
                for k, v in log.items():
                    per_epoch[k].append(v)
        log_dicts.append(log_dict)
    return log_dicts
def main():
    """Entry point: parse arguments, load the logs and run the chosen task.

    Raises:
        AssertionError: If a given log path does not end with ``.json``.
    """
    args = parse_args()
    json_logs = args.json_logs
    for json_log in json_logs:
        assert json_log.endswith('.json'), \
            f'{json_log} is not a .json log file'
    log_dicts = load_json_logs(json_logs)
    # Dispatch through an explicit table rather than `eval` so that only
    # the known task functions can ever be invoked from the command line.
    tasks = {
        'plot_curve': plot_curve,
        'cal_train_time': cal_train_time,
    }
    tasks[args.task](log_dicts, args)


if __name__ == '__main__':
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment