Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
TS-MODELS-OPT
training
Autonomous-Driving-models
Commits
d2b71343
Commit
d2b71343
authored
Apr 08, 2026
by
雍大凯
Browse files
add code
parent
69e57885
Changes
259
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4717 additions
and
0 deletions
+4717
-0
docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
+747
-0
docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
+749
-0
docker-hub/FlashOCC/Flashocc/mmdetection3d
docker-hub/FlashOCC/Flashocc/mmdetection3d
+1
-0
docker-hub/FlashOCC/Flashocc/projects/__init__.py
docker-hub/FlashOCC/Flashocc/projects/__init__.py
+0
-0
docker-hub/FlashOCC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
...CC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
+0
-0
docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
...c/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+265
-0
docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
...CC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
+270
-0
docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
...onfigs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+323
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
...shocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+260
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
...Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
+7
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
...OCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
+252
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
...CC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
+7
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
...ashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
+268
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
...s/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
+260
-0
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
...s/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
+302
-0
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
...igs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
+356
-0
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
...tic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
+7
-0
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
...anoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
+357
-0
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
...igs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
+279
-0
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
...figs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
+7
-0
No files found.
docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
0 → 100644
View file @
d2b71343
// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
// Modified by Haisong Liu
#include <torch/extension.h>
#include <ATen/cuda/Atomic.cuh>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cassert>
#include <cfloat>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>
// Capacity of the per-ray scratch arrays (path, d, csd, ...) that each kernel
// thread keeps while marching a ray. Original sizing note: 700 + 700 + 45 + 1.
#define MAX_D 1446
// Upper bound on the number of voxel-traversal iterations performed per ray.
#define MAX_STEP 1000
// The traversal loops append at most one scratch entry per iteration and stop
// once the step counter exceeds MAX_STEP, i.e. after at most MAX_STEP + 1
// iterations. Enforce that the scratch arrays can hold that many entries.
static_assert(MAX_STEP + 1 <= MAX_D,
              "MAX_STEP + 1 must not exceed MAX_D (per-ray scratch capacity)");
// Loss selector handed to render_cuda_kernel, which uses it to choose the
// derivative of the loss with respect to the rendered ray distance.
enum LossType {
  L1 = 0,      // sign of (prediction - target)
  L2 = 1,      // (prediction - target)
  ABSREL = 2   // sign of (prediction - target) divided by the target
};
// Phase selector handed to render_forward_cuda_kernel. Only in the TRAIN
// phase is the ground-truth distance clamped to the distance at which the ray
// leaves the voxel grid.
enum PhaseName {
  TEST = 0,
  TRAIN = 1
};
/**
 * Marks every voxel that contains at least one valid point as occupied.
 *
 * One thread handles one point: blockIdx.y selects the batch element and the
 * flattened x thread index selects the point inside that batch element.
 *
 * @param points    indexed as [batch][point][0..2] = (x, y, z); the
 *                  coordinates are rounded down and used directly as voxel
 *                  indices
 * @param tindex    indexed as [batch][point]; time index of the point, a
 *                  negative value marks a padded point that is skipped
 * @param occupancy indexed as [batch][time][z][y][x]; set to 1 for every voxel
 *                  hit by a point that lies inside the grid
 */
template <typename scalar_t>
__global__ void init_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> points,
    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> tindex,
    torch::PackedTensorAccessor32<scalar_t, 5, torch::RestrictPtrTraits> occupancy) {
  // batch index
  const auto n = blockIdx.y;
  // point (ray) index
  const auto c = blockIdx.x * blockDim.x + threadIdx.x;
  // number of points per batch element and number of time steps
  const auto M = points.size(1);
  const auto T = occupancy.size(1);

  // the launch rounds the thread count up, so surplus threads do nothing
  if (c >= M) return;

  const auto t = tindex[n][c];
  // a time index beyond the grid is only tolerated for a static (T == 1) grid
  assert(T == 1 || t < T);
  // negative time index == padded point
  if (t < 0) return;

  // with a static grid every point is written into time slot 0
  const int ts = (T == 1) ? 0 : static_cast<int>(t);

  const int vzsize = occupancy.size(2);
  const int vysize = occupancy.size(3);
  const int vxsize = occupancy.size(4);

  // Round down (not toward zero): a plain int() cast would map coordinates in
  // (-1, 0), which lie outside the grid, onto voxel 0.
  const int vx = static_cast<int>(floor(static_cast<double>(points[n][c][0])));
  const int vy = static_cast<int>(floor(static_cast<double>(points[n][c][1])));
  const int vz = static_cast<int>(floor(static_cast<double>(points[n][c][2])));

  const bool inside = (0 <= vx && vx < vxsize) &&
                      (0 <= vy && vy < vysize) &&
                      (0 <= vz && vz < vzsize);
  if (inside) {
    // several threads may store the same constant here, which is harmless
    occupancy[n][ts][vz][vy][vx] = 1;
  }
}
/**
 * Forward-only ray casting through a voxel grid (one thread per ray).
 *
 * The ray starts at origin[n][t] and points towards points[n][c]. It is marched
 * voxel by voxel; the first traversed in-grid voxel whose sigma exceeds 0.5 is
 * reported as the hit. If no voxel exceeds the threshold, the last traversed
 * in-grid voxel is reported instead.
 *
 * Rays that are padded (negative time index), have zero length, or never enter
 * the grid leave their output entries untouched.
 *
 * @param sigma       indexed as [batch][time][z][y][x]
 * @param origin      indexed as [batch][time][0..2] = (x, y, z)
 * @param points      indexed as [batch][ray][0..2] = (x, y, z) ray end point
 * @param tindex      indexed as [batch][ray]; time index of the ray
 * @param pred_dist   [batch][ray]; distance at which the ray leaves the hit voxel
 * @param gt_dist     [batch][ray]; origin-to-end-point distance, clamped to the
 *                    grid exit distance in the TRAIN phase
 * @param coord_index [batch][ray][0..2] = (x, y, z) index of the hit voxel
 * @param train_phase TEST or TRAIN
 */
template <typename scalar_t>
__global__ void render_forward_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t, 5, torch::RestrictPtrTraits> sigma,
    const torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> origin,
    const torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> points,
    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> tindex,
    torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> pred_dist,
    torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> gt_dist,
    torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> coord_index,
    PhaseName train_phase) {
  // batch index
  const auto n = blockIdx.y;
  // ray index
  const auto c = blockIdx.x * blockDim.x + threadIdx.x;
  // number of rays and number of time steps
  const auto M = points.size(1);
  const auto T = sigma.size(1);

  // the launch rounds the thread count up, so surplus threads do nothing
  if (c >= M) return;

  const auto t = tindex[n][c];
  // a time index beyond the grid is only tolerated for a static (T == 1) sigma
  assert(T == 1 || t < T);
  // negative time index == padded ray
  if (t < 0) return;

  // time slot used for sigma (always 0 for a static grid) and for the origin
  const int ts = (T == 1) ? 0 : static_cast<int>(t);
  const int to = static_cast<int>(t);

  // grid shape
  const int vzsize = sigma.size(2);
  const int vysize = sigma.size(3);
  const int vxsize = sigma.size(4);

  // ray origin and end point
  const double xo = origin[n][to][0];
  const double yo = origin[n][to][1];
  const double zo = origin[n][to][2];
  const double xe = points[n][c][0];
  const double ye = points[n][c][1];
  const double ze = points[n][c][2];

  // origin -> end point
  const double rx = xe - xo;
  const double ry = ye - yo;
  const double rz = ze - zo;
  double gt_d = sqrt(rx * rx + ry * ry + rz * rz);

  // A zero-length (or NaN) ray has no direction; normalising it would divide by
  // zero and write NaN results, so treat it like a ray that misses the grid.
  if (!(gt_d > 0.0)) return;

  // unit direction
  const double dx = rx / gt_d;
  const double dy = ry / gt_d;
  const double dz = rz / gt_d;

  // direction in which the voxel indices change
  const int stepX = (dx >= 0) ? 1 : -1;
  const int stepY = (dy >= 0) ? 1 : -1;
  const int stepZ = (dz >= 0) ? 1 : -1;

  // Voxel containing the origin. Round down rather than toward zero so that an
  // origin with a coordinate in (-1, 0) is not mistaken for being in voxel 0.
  int vx = static_cast<int>(floor(xo));
  int vy = static_cast<int>(floor(yo));
  int vz = static_cast<int>(floor(zo));

  // first voxel boundary crossed along each axis
  const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
  const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
  const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);

  // ray distance at which the next boundary is crossed along each axis
  double tMaxX = (dx != 0) ? (next_voxel_boundary_x - xo) / dx : DBL_MAX;
  double tMaxY = (dy != 0) ? (next_voxel_boundary_y - yo) / dy : DBL_MAX;
  double tMaxZ = (dz != 0) ? (next_voxel_boundary_z - zo) / dz : DBL_MAX;

  // ray distance needed to cross one whole voxel along each axis
  const double tDeltaX = (dx != 0) ? stepX / dx : DBL_MAX;
  const double tDeltaY = (dy != 0) ? stepY / dy : DBL_MAX;
  const double tDeltaZ = (dz != 0) ? stepZ / dz : DBL_MAX;

  // Per-ray record of the in-grid voxels and the distance at which the ray
  // leaves each of them. (The transmittance terms the original computed here
  // were never read in this forward-only kernel and have been dropped.)
  int3 path[MAX_D];
  double d[MAX_D];

  int step = 0;   // iterations performed
  int count = 0;  // in-grid voxels recorded
  bool was_inside = false;

  while (true) {
    const bool inside = (0 <= vx && vx < vxsize) &&
                        (0 <= vy && vy < vysize) &&
                        (0 <= vz && vz < vzsize);
    if (inside) {
      was_inside = true;
      path[count] = make_int3(vx, vy, vz);
    } else if (was_inside) {
      // the ray left the grid and cannot come back
      break;
    }

    // distance travelled when the ray leaves the current voxel
    double _d = 0.0;
    if (tMaxX < tMaxY) {
      if (tMaxX < tMaxZ) {
        _d = tMaxX;
        vx += stepX;
        tMaxX += tDeltaX;
      } else {
        _d = tMaxZ;
        vz += stepZ;
        tMaxZ += tDeltaZ;
      }
    } else {
      if (tMaxY < tMaxZ) {
        _d = tMaxY;
        vy += stepY;
        tMaxY += tDeltaY;
      } else {
        _d = tMaxZ;
        vz += stepZ;
        tMaxZ += tDeltaZ;
      }
    }

    if (inside) {
      d[count] = _d;
      count++;
    }

    step++;
    if (step > MAX_STEP) break;
  }

  // the number of recorded voxels must fit the scratch arrays
  assert(count <= MAX_D);

  // the ray never touched the grid
  if (count == 0) return;

  // default: the last in-grid voxel and its exit distance
  const double max_d = d[count - 1];
  double exp_d = max_d;
  int3 hit = path[count - 1];

  // first voxel along the ray whose sigma exceeds the occupancy threshold
  for (int i = 0; i < count; i++) {
    const int3 &v = path[i];
    const double occ = sigma[n][ts][v.z][v.y][v.x];
    if (occ > 0.5) {
      exp_d = d[i];
      hit = v;
      break;
    }
  }

  // during training the target cannot lie beyond the grid exit
  if (train_phase == TRAIN) {
    gt_d = min(gt_d, max_d);
  }

  pred_dist[n][c] = exp_d;
  gt_dist[n][c] = gt_d;
  coord_index[n][c][0] = double(hit.x);
  coord_index[n][c][1] = double(hit.y);
  coord_index[n][c][2] = double(hit.z);
}
/*
 * Host launcher for render_forward_cuda_kernel.
 *
 * input
 *   sigma      : N x T x Z x Y x X (the original notes call this H x L x W)
 *   origin     : N x T x 3
 *   points     : N x M x C, only the first three channels (x, y, z) are read
 *   tindex     : N x M, same dtype as sigma
 *   grid       : accepted for interface compatibility; not used
 *   phase_name : "test" or "train"
 * output {pred_dist, gt_dist, coord_index}
 *   pred_dist   : N x M, initialised to -1
 *   gt_dist     : N x M, initialised to -1
 *   coord_index : N x M x 3, initialised to 0
 *
 * Throws (via TORCH_CHECK) on an unknown phase name or a CUDA failure.
 */
std::vector<torch::Tensor> render_forward_cuda(
    torch::Tensor sigma,
    torch::Tensor origin,
    torch::Tensor points,
    torch::Tensor tindex,
    const std::vector<int> grid,
    std::string phase_name) {
  // The grid extents are taken from sigma inside the kernel; the argument is
  // kept only so existing callers keep working.
  (void)grid;

  // Report a bad phase name as an exception instead of terminating the whole
  // host process.
  PhaseName train_phase = TEST;
  if (phase_name == "train") {
    train_phase = TRAIN;
  } else {
    TORCH_CHECK(phase_name == "test", "UNKNOWN PHASE NAME: ", phase_name);
  }

  const auto N = points.size(0);  // batch size
  const auto M = points.size(1);  // number of rays

  // The kernel accesses the outputs with sigma's scalar type, so they have to
  // be allocated with that dtype (not the default one).
  const auto options =
      torch::TensorOptions().dtype(sigma.dtype()).device(sigma.device());
  auto gt_dist = torch::full({N, M}, -1, options);
  auto pred_dist = torch::full({N, M}, -1, options);
  auto coord_index = torch::zeros({N, M, 3}, options);

  // A zero-sized grid is an invalid launch configuration; nothing to render.
  if (N > 0 && M > 0) {
    const int threads = 1024;
    const dim3 blocks((M + threads - 1) / threads, N);

    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_forward_cuda", ([&] {
      render_forward_cuda_kernel<scalar_t><<<blocks, threads>>>(
          sigma.packed_accessor32<scalar_t, 5, torch::RestrictPtrTraits>(),
          origin.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
          points.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
          tindex.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          pred_dist.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          gt_dist.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          coord_index.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
          train_phase);
    }));

    const cudaError_t launch_err = cudaGetLastError();
    TORCH_CHECK(launch_err == cudaSuccess,
                "render_forward_cuda: kernel launch failed: ",
                cudaGetErrorString(launch_err));
  }

  const cudaError_t sync_err = cudaDeviceSynchronize();
  TORCH_CHECK(sync_err == cudaSuccess,
              "render_forward_cuda: kernel execution failed: ",
              cudaGetErrorString(sync_err));

  return {pred_dist, gt_dist, coord_index};
}
/**
 * Differentiable depth rendering through a voxel grid (one thread per ray).
 *
 * Forward: the ray from origin[n][t] towards points[n][c] is marched voxel by
 * voxel. For every in-grid voxel the product sigma * segment length is
 * accumulated, and the rendered distance is the sum of p[i] * d[i] plus the
 * escape probability times the grid exit distance.
 * Backward: the derivative of the chosen loss with respect to every traversed
 * sigma is accumulated into grad_sigma.
 *
 * Rays that are padded (negative time index), have zero length, or never enter
 * the grid leave all outputs untouched.
 *
 * @param sigma      indexed as [batch][time][z][y][x]
 * @param origin     indexed as [batch][time][0..2] = (x, y, z)
 * @param points     indexed as [batch][ray][0..2] = (x, y, z) ray end point
 * @param tindex     indexed as [batch][ray]; time index of the ray
 * @param pred_dist  [batch][ray]; rendered (expected) ray distance
 * @param gt_dist    [batch][ray]; target distance clamped to the grid exit
 * @param grad_sigma same indexing as sigma; receives the accumulated gradient
 * @param loss_type  L1, L2 or ABSREL
 */
template <typename scalar_t>
__global__ void render_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t, 5, torch::RestrictPtrTraits> sigma,
    const torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> origin,
    const torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> points,
    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> tindex,
    torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> pred_dist,
    torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> gt_dist,
    torch::PackedTensorAccessor32<scalar_t, 5, torch::RestrictPtrTraits> grad_sigma,
    LossType loss_type) {
  // batch index
  const auto n = blockIdx.y;
  // ray index
  const auto c = blockIdx.x * blockDim.x + threadIdx.x;
  // number of rays and number of time steps
  const auto M = points.size(1);
  const auto T = sigma.size(1);

  // the launch rounds the thread count up, so surplus threads do nothing
  if (c >= M) return;

  const auto t = tindex[n][c];
  // a time index beyond the grid is only tolerated for a static (T == 1) sigma
  assert(T == 1 || t < T);
  // negative time index == padded ray
  if (t < 0) return;

  // time slot used for sigma (always 0 for a static grid) and for the origin
  const int ts = (T == 1) ? 0 : static_cast<int>(t);
  const int to = static_cast<int>(t);

  // grid shape
  const int vzsize = sigma.size(2);
  const int vysize = sigma.size(3);
  const int vxsize = sigma.size(4);

  // ray origin and end point
  const double xo = origin[n][to][0];
  const double yo = origin[n][to][1];
  const double zo = origin[n][to][2];
  const double xe = points[n][c][0];
  const double ye = points[n][c][1];
  const double ze = points[n][c][2];

  // origin -> end point
  const double rx = xe - xo;
  const double ry = ye - yo;
  const double rz = ze - zo;
  double gt_d = sqrt(rx * rx + ry * ry + rz * rz);

  // A zero-length (or NaN) ray has no direction; normalising it would divide by
  // zero and scatter NaN gradients into grad_sigma, so skip it.
  if (!(gt_d > 0.0)) return;

  // unit direction
  const double dx = rx / gt_d;
  const double dy = ry / gt_d;
  const double dz = rz / gt_d;

  // direction in which the voxel indices change
  const int stepX = (dx >= 0) ? 1 : -1;
  const int stepY = (dy >= 0) ? 1 : -1;
  const int stepZ = (dz >= 0) ? 1 : -1;

  // Voxel containing the origin. Round down rather than toward zero so that an
  // origin with a coordinate in (-1, 0) is not mistaken for being in voxel 0.
  int vx = static_cast<int>(floor(xo));
  int vy = static_cast<int>(floor(yo));
  int vz = static_cast<int>(floor(zo));

  // first voxel boundary crossed along each axis
  const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
  const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
  const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);

  // ray distance at which the next boundary is crossed along each axis
  double tMaxX = (dx != 0) ? (next_voxel_boundary_x - xo) / dx : DBL_MAX;
  double tMaxY = (dy != 0) ? (next_voxel_boundary_y - yo) / dy : DBL_MAX;
  double tMaxZ = (dz != 0) ? (next_voxel_boundary_z - zo) / dz : DBL_MAX;

  // ray distance needed to cross one whole voxel along each axis
  const double tDeltaX = (dx != 0) ? stepX / dx : DBL_MAX;
  const double tDeltaY = (dy != 0) ? stepY / dy : DBL_MAX;
  const double tDeltaZ = (dz != 0) ? stepZ / dz : DBL_MAX;

  // per-ray scratch, one entry per traversed in-grid voxel
  int3 path[MAX_D];    // voxel index
  double csd[MAX_D];   // cumulative sum of sigma * segment length
  double p[MAX_D];     // termination probability inside the voxel
  double d[MAX_D];     // ray distance when leaving the voxel
  double dt[MAX_D];    // segment length inside the voxel

  int step = 0;         // iterations performed
  int count = 0;        // in-grid voxels recorded
  double last_d = 0.0;  // distance at which the previous voxel was left
  bool was_inside = false;

  while (true) {
    const bool inside = (0 <= vx && vx < vxsize) &&
                        (0 <= vy && vy < vysize) &&
                        (0 <= vz && vz < vzsize);
    if (inside) {
      was_inside = true;
      path[count] = make_int3(vx, vy, vz);
    } else if (was_inside) {
      // the ray left the grid and cannot come back
      break;
    } else if (last_d > gt_d) {
      // passed the end point without ever entering the grid
      break;
    }

    // distance travelled when the ray leaves the current voxel
    double _d = 0.0;
    if (tMaxX < tMaxY) {
      if (tMaxX < tMaxZ) {
        _d = tMaxX;
        vx += stepX;
        tMaxX += tDeltaX;
      } else {
        _d = tMaxZ;
        vz += stepZ;
        tMaxZ += tDeltaZ;
      }
    } else {
      if (tMaxY < tMaxZ) {
        _d = tMaxY;
        vy += stepY;
        tMaxY += tDeltaY;
      } else {
        _d = tMaxZ;
        vz += stepZ;
        tMaxZ += tDeltaZ;
      }
    }

    if (inside) {
      // vx/vy/vz already point at the next voxel, so use the recorded index
      const int3 &v = path[count];
      const double _sigma = sigma[n][ts][v.z][v.y][v.x];
      // clamp: the segment length must never become negative
      const double _delta = max(0.0, _d - last_d);
      const double sd = _sigma * _delta;
      if (count == 0) {
        csd[count] = sd;
        p[count] = 1 - exp(-sd);
      } else {
        csd[count] = csd[count - 1] + sd;
        p[count] = exp(-csd[count - 1]) - exp(-csd[count]);
      }
      d[count] = _d;
      dt[count] = _delta;
      count++;
    }

    last_d = _d;
    step++;
    if (step > MAX_STEP) break;
  }

  // the number of recorded voxels must fit the scratch arrays
  assert(count <= MAX_D);

  // the ray never touched the grid
  if (count == 0) return;

  // expected ray distance over the traversed voxels
  double exp_d = 0.0;
  for (int i = 0; i < count; i++)
    exp_d += p[i] * d[i];

  // imaginary sample at the grid exit: p_out is the probability that the ray
  // escapes the grid, max_d the distance at which it does
  const double p_out = exp(-csd[count - 1]);
  const double max_d = d[count - 1];
  exp_d += (p_out * max_d);
  gt_d = min(gt_d, max_d);

  pred_dist[n][c] = exp_d;
  gt_dist[n][c] = gt_d;

  /* backward raymarching */
  double dd_dsigma[MAX_D];
  // NOTE (kept from the original): this recurrence probably deserves a second
  // look; it is reproduced unchanged.
  for (int i = count - 1; i >= 0; i--) {
    if (i == count - 1)
      dd_dsigma[i] = p_out * max_d;
    else
      dd_dsigma[i] = dd_dsigma[i + 1] - exp(-csd[i]) * (d[i + 1] - d[i]);
  }
  for (int i = count - 1; i >= 0; i--)
    dd_dsigma[i] *= dt[i];
  // cap at the grid boundary
  for (int i = count - 1; i >= 0; i--)
    dd_dsigma[i] -= dt[i] * p_out * max_d;

  // derivative of the loss with respect to the rendered distance
  double dl_dd = 1.0;
  if (loss_type == L1)
    dl_dd = (exp_d >= gt_d) ? 1 : -1;
  else if (loss_type == L2)
    dl_dd = (exp_d - gt_d);
  else if (loss_type == ABSREL)
    dl_dd = (exp_d >= gt_d) ? (1.0 / gt_d) : -(1.0 / gt_d);

  // Chain rule. Different rays (threads) routinely cross the same voxel, so
  // the accumulation has to be atomic; a plain += loses updates.
  for (int i = 0; i < count; i++) {
    const int3 &v = path[i];
    gpuAtomicAdd(&grad_sigma[n][ts][v.z][v.y][v.x],
                 static_cast<scalar_t>(dl_dd * dd_dsigma[i]));
  }
}
/*
 * Host launcher for render_cuda_kernel (rendering plus gradient w.r.t. sigma).
 *
 * input
 *   sigma     : N x T x Z x Y x X (the original notes call this H x L x W)
 *   origin    : N x T x 3
 *   points    : N x M x C, only the first three channels (x, y, z) are read
 *   tindex    : N x M, same dtype as sigma
 *   loss_name : "l1", "l2", "absrel" or "bce" ("bce" is handled like "l1")
 * output {pred_dist, gt_dist, grad_sigma}
 *   pred_dist  : N x M, initialised to -1
 *   gt_dist    : N x M, initialised to -1
 *   grad_sigma : same shape as sigma, initialised to 0
 *
 * Throws (via TORCH_CHECK) on an unknown loss name or a CUDA failure.
 */
std::vector<torch::Tensor> render_cuda(
    torch::Tensor sigma,
    torch::Tensor origin,
    torch::Tensor points,
    torch::Tensor tindex,
    std::string loss_name) {
  // Report a bad loss name as an exception instead of terminating the whole
  // host process.
  LossType loss_type = L1;
  if (loss_name == "l1" || loss_name == "bce") {
    loss_type = L1;
  } else if (loss_name == "l2") {
    loss_type = L2;
  } else if (loss_name == "absrel") {
    loss_type = ABSREL;
  } else {
    TORCH_CHECK(false, "UNKNOWN LOSS TYPE: ", loss_name);
  }

  const auto N = points.size(0);  // batch size
  const auto M = points.size(1);  // number of rays

  // The kernel accesses the outputs with sigma's scalar type, so they have to
  // be allocated with that dtype (not the default one).
  const auto options =
      torch::TensorOptions().dtype(sigma.dtype()).device(sigma.device());
  auto gt_dist = torch::full({N, M}, -1, options);
  auto pred_dist = torch::full({N, M}, -1, options);
  auto grad_sigma = torch::zeros_like(sigma);

  // A zero-sized grid is an invalid launch configuration; nothing to render.
  if (N > 0 && M > 0) {
    const int threads = 1024;
    const dim3 blocks((M + threads - 1) / threads, N);

    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_cuda", ([&] {
      render_cuda_kernel<scalar_t><<<blocks, threads>>>(
          sigma.packed_accessor32<scalar_t, 5, torch::RestrictPtrTraits>(),
          origin.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
          points.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
          tindex.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          pred_dist.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          gt_dist.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          grad_sigma.packed_accessor32<scalar_t, 5, torch::RestrictPtrTraits>(),
          loss_type);
    }));

    const cudaError_t launch_err = cudaGetLastError();
    TORCH_CHECK(launch_err == cudaSuccess,
                "render_cuda: kernel launch failed: ",
                cudaGetErrorString(launch_err));
  }

  const cudaError_t sync_err = cudaDeviceSynchronize();
  TORCH_CHECK(sync_err == cudaSuccess,
              "render_cuda: kernel execution failed: ",
              cudaGetErrorString(sync_err));

  return {pred_dist, gt_dist, grad_sigma};
}
/*
 * Host launcher for init_cuda_kernel: builds an occupancy grid in which every
 * voxel containing at least one valid point is set to 1.
 *
 * input
 *   points : N x M x 3
 *   tindex : N x M, same dtype as points
 *   grid   : {T, H, L, W}, the extents of the occupancy grid
 * output
 *   occupancy : N x T x H x L x W, same dtype and device as points
 *
 * Throws (via TORCH_CHECK) when grid has fewer than four entries or on a CUDA
 * failure.
 */
torch::Tensor init_cuda(
    torch::Tensor points,
    torch::Tensor tindex,
    const std::vector<int> grid) {
  // grid[0..3] are read below; indexing a shorter vector would be undefined
  TORCH_CHECK(grid.size() >= 4,
              "init_cuda: grid must hold {T, H, L, W}, got ", grid.size(),
              " entries");

  const auto N = points.size(0);  // batch size
  const auto M = points.size(1);  // number of points
  const auto T = grid[0];
  const auto H = grid[1];
  const auto L = grid[2];
  const auto W = grid[3];

  const auto options = torch::TensorOptions()
                           .dtype(points.dtype())
                           .device(points.device())
                           .requires_grad(false);
  auto occupancy = torch::zeros({N, T, H, L, W}, options);

  // A zero-sized grid is an invalid launch configuration; nothing to mark.
  if (N > 0 && M > 0) {
    const int threads = 1024;
    const dim3 blocks((M + threads - 1) / threads, N);

    // mark every voxel that contains one or more points as occupied
    AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "init_cuda", ([&] {
      init_cuda_kernel<scalar_t><<<blocks, threads>>>(
          points.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
          tindex.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
          occupancy.packed_accessor32<scalar_t, 5, torch::RestrictPtrTraits>());
    }));

    const cudaError_t launch_err = cudaGetLastError();
    TORCH_CHECK(launch_err == cudaSuccess,
                "init_cuda: kernel launch failed: ",
                cudaGetErrorString(launch_err));
  }

  const cudaError_t sync_err = cudaDeviceSynchronize();
  TORCH_CHECK(sync_err == cudaSuccess,
              "init_cuda: kernel execution failed: ",
              cudaGetErrorString(sync_err));

  return occupancy;
}
\ No newline at end of file
docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
0 → 100644
View file @
d2b71343
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
// Modified by Haisong Liu
#include <torch/extension.h>
#include <stdio.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <vector>
#include <string>
#include <iostream>
// Capacity of the per-ray scratch arrays (path, csd, p, d) that each kernel
// thread keeps while marching a ray. Original sizing note: 700 + 700 + 45 + 1.
#define MAX_D 1446
// Upper bound on the number of voxel-traversal iterations performed per ray.
#define MAX_STEP 1000
// The traversal loop appends at most one scratch entry per iteration and stops
// once the step counter exceeds MAX_STEP, i.e. after at most MAX_STEP + 1
// iterations. Enforce that the scratch arrays can hold that many entries.
static_assert(MAX_STEP + 1 <= MAX_D,
              "MAX_STEP + 1 must not exceed MAX_D (per-ray scratch capacity)");
// Loss selector; the enumerator values are spelled out because they are
// passed by value into device code.
enum LossType {
  L1 = 0,
  L2 = 1,
  ABSREL = 2
};
// Phase selector for render_forward_cuda_kernel, which compares it against the
// literal 1, so TRAIN must keep the value 1.
enum PhaseName {
  TEST = 0,
  TRAIN = 1
};
/**
 * Marks every voxel that contains at least one valid point as occupied.
 *
 * One thread handles one point: blockIdx.y selects the batch element and the
 * flattened x thread index selects the point inside that batch element.
 *
 * @param points    indexed as [batch][point][0..2] = (x, y, z); the
 *                  coordinates are rounded down and used directly as voxel
 *                  indices
 * @param tindex    indexed as [batch][point]; time index of the point, a
 *                  negative value marks a padded point that is skipped
 * @param occupancy indexed as [batch][time][z][y][x]; set to 1 for every voxel
 *                  hit by a point that lies inside the grid
 */
template <typename scalar_t>
__global__ void init_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy) {
  // batch index
  const auto n = blockIdx.y;
  // point (ray) index
  const auto c = blockIdx.x * blockDim.x + threadIdx.x;
  // number of points per batch element and number of time steps
  const auto M = points.size(1);
  const auto T = occupancy.size(1);

  // the launch rounds the thread count up, so surplus threads do nothing
  if (c >= M) return;

  const auto t = tindex[n][c];
  // a time index beyond the grid is only tolerated for a static (T == 1) grid
  assert(T == 1 || t < T);
  // negative time index == padded point
  if (t < 0) return;

  // with a static grid every point is written into time slot 0
  const int ts = (T == 1) ? 0 : static_cast<int>(t);

  const int vzsize = occupancy.size(2);
  const int vysize = occupancy.size(3);
  const int vxsize = occupancy.size(4);

  // Round down (not toward zero): a plain int() cast would map coordinates in
  // (-1, 0), which lie outside the grid, onto voxel 0.
  const int vx = static_cast<int>(floor(static_cast<double>(points[n][c][0])));
  const int vy = static_cast<int>(floor(static_cast<double>(points[n][c][1])));
  const int vz = static_cast<int>(floor(static_cast<double>(points[n][c][2])));

  const bool inside = (0 <= vx && vx < vxsize) &&
                      (0 <= vy && vy < vysize) &&
                      (0 <= vz && vz < vzsize);
  if (inside) {
    // several threads may store the same constant here, which is harmless
    occupancy[n][ts][vz][vy][vx] = 1;
  }
}
template <typename scalar_t>
__global__ void render_forward_cuda_kernel(
const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
// torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> pog,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> coord_index,
PhaseName train_phase) {
// batch index
const auto n = blockIdx.y;
// ray index
const auto c = blockIdx.x * blockDim.x + threadIdx.x;
// num of rays
const auto M = points.size(1);
const auto T = sigma.size(1);
// we allocated more threads than num_rays
if (c < M) {
// ray end point
const auto t = tindex[n][c];
// invalid points
// assert(t < T);
assert(T == 1 || t < T);
// time index for sigma
// when T = 1, we have a static sigma
const auto ts = (T == 1) ? 0 : t;
// if t < 0, it is a padded point
if (t < 0) return;
// grid shape
const int vzsize = sigma.size(2);
const int vysize = sigma.size(3);
const int vxsize = sigma.size(4);
// assert(vzsize + vysize + vxsize <= MAX_D);
// origin
const double xo = origin[n][t][0];
const double yo = origin[n][t][1];
const double zo = origin[n][t][2];
// end point
const double xe = points[n][c][0];
const double ye = points[n][c][1];
const double ze = points[n][c][2];
// locate the voxel where the origin resides
const int vxo = int(xo);
const int vyo = int(yo);
const int vzo = int(zo);
const int vxe = int(xe);
const int vye = int(ye);
const int vze = int(ze);
// NOTE: new
int vx = vxo;
int vy = vyo;
int vz = vzo;
// origin to end
const double rx = xe - xo;
const double ry = ye - yo;
const double rz = ze - zo;
double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
// directional vector
const double dx = rx / gt_d;
const double dy = ry / gt_d;
const double dz = rz / gt_d;
// In which direction the voxel ids are incremented.
const int stepX = (dx >= 0) ? 1 : -1;
const int stepY = (dy >= 0) ? 1 : -1;
const int stepZ = (dz >= 0) ? 1 : -1;
// Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
// tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
// the value of t at which the ray crosses the first vertical voxel boundary
double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
// tDeltaX, tDeltaY, tDeltaZ --
// how far along the ray we must move for the horizontal component to equal the width of a voxel
// the direction in which we traverse the grid
// can only be FLT_MAX if we never go in that direction
const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
int3 path[MAX_D];
double csd[MAX_D]; // cumulative sum of sigma times delta
double p[MAX_D]; // alpha
double d[MAX_D];
// forward raymarching with voxel traversal
int step = 0; // total number of voxels traversed
int count = 0; // number of voxels traversed inside the voxel grid
double last_d = 0.0; // correct initialization
// voxel traversal raycasting
bool was_inside = false;
while (true) {
bool inside = (0 <= vx && vx < vxsize) &&
(0 <= vy && vy < vysize) &&
(0 <= vz && vz < vzsize);
if (inside) {
was_inside = true;
path[count] = make_int3(vx, vy, vz);
} else if (was_inside) { // was but no longer inside
// we know we are not coming back so terminate
break;
} /*else if (last_d > gt_d) {
break;
} */
/*else { // has not gone inside yet
// assert(count == 0);
// (1) when we have hit the destination but haven't gone inside the voxel grid
// (2) when we have traveled MAX_D voxels but haven't found one valid voxel
// handle intersection corner cases in case of infinite loop
bool hit = (vx == vxe && vy == vye && vz == vze); // this test seems brittle with corner cases
if (hit || step >= MAX_D)
break;
//if (last_d >= gt_d || step >= MAX_D) break;
} */
// _d represents the ray distance has traveled before escaping the current voxel cell
double _d = 0.0;
// voxel traversal
if (tMaxX < tMaxY) {
if (tMaxX < tMaxZ) {
_d = tMaxX;
vx += stepX;
tMaxX += tDeltaX;
} else {
_d = tMaxZ;
vz += stepZ;
tMaxZ += tDeltaZ;
}
} else {
if (tMaxY < tMaxZ) {
_d = tMaxY;
vy += stepY;
tMaxY += tDeltaY;
} else {
_d = tMaxZ;
vz += stepZ;
tMaxZ += tDeltaZ;
}
}
if (inside) {
// get sigma at the current voxel
const int3 &v = path[count]; // use the recorded index
const double _sigma = sigma[n][ts][v.z][v.y][v.x];
const double _delta = max(0.0, _d - last_d); // THIS TURNS OUT IMPORTANT
const double sd = _sigma * _delta;
if (count == 0) { // the first voxel inside
csd[count] = sd;
p[count] = 1 - exp(-sd);
} else {
csd[count] = csd[count-1] + sd;
p[count] = exp(-csd[count-1]) - exp(-csd[count]);
}
// record the traveled distance
d[count] = _d;
// count the number of voxels we have escaped
count ++;
}
last_d = _d;
step ++;
if (step > MAX_STEP) {
break;
}
}
// the total number of voxels visited should not exceed this number
assert(count <= MAX_D);
if (count > 0) {
// compute the expected ray distance
//double exp_d = 0.0;
double exp_d = d[count-1];
const int3 &v_init = path[count-1];
int x = v_init.x;
int y = v_init.y;
int z = v_init.z;
for (int i = 0; i < count; i++) {
//printf("%f\t%f\n",p[i], d[i]);
//exp_d += p[i] * d[i];
const int3 &v = path[i];
const double occ = sigma[n][ts][v.z][v.y][v.x];
if (occ > 0.5) {
exp_d = d[i];
x = v.x;
y = v.y;
z = v.z;
break;
}
}
//printf("%f\n",exp_d);
// add an imaginary sample at the end point should gt_d exceeds max_d
double p_out = exp(-csd[count-1]);
double max_d = d[count-1];
// if (gt_d > max_d)
// exp_d += (p_out * gt_d);
// p_out is the probability the ray escapes the voxel grid
//exp_d += (p_out * max_d);
if (train_phase == 1) {
gt_d = min(gt_d, max_d);
}
// write the rendered ray distance (max_d)
pred_dist[n][c] = exp_d;
gt_dist[n][c] = gt_d;
coord_index[n][c][0] = double(x);
coord_index[n][c][1] = double(y);
coord_index[n][c][2] = double(z);
// // write occupancy
// for (int i = 0; i < count; i ++) {
// const int3 &v = path[i];
// auto & occ = pog[n][t][v.z][v.y][v.x];
// if (p[i] >= occ) {
// occ = p[i];
// }
// }
}
}
}
/*
 * Host-side launcher for render_forward_cuda_kernel (forward pass only).
 *
 * input shape
 *   sigma  : N x T x H x L x W
 *   origin : N x T x 3
 *   points : N x M x 4
 *   tindex : N x M
 * other arguments
 *   grid       : kept for interface compatibility; not read by this function
 *   phase_name : "train" or "test"
 * output (in this order)
 *   pred_dist   : N x M      (initialised to -1, written per ray by the kernel)
 *   gt_dist     : N x M      (initialised to -1, written per ray by the kernel)
 *   coord_index : N x M x 3  (initialised to 0, written per ray by the kernel)
 *
 * Raises a c10::Error (via TORCH_CHECK) when phase_name is not recognised.
 */
std::vector<torch::Tensor> render_forward_cuda(
    torch::Tensor sigma,
    torch::Tensor origin,
    torch::Tensor points,
    torch::Tensor tindex,
    const std::vector<int> grid,
    std::string phase_name) {
  const auto N = points.size(0); // batch size
  const auto M = points.size(1); // num of rays

  // Resolve the phase first so a bad argument is reported as an exception
  // instead of terminating the whole host process.
  const bool is_test = (phase_name == "test");
  TORCH_CHECK(is_test || phase_name == "train",
              "UNKNOWN PHASE NAME: ", phase_name);
  const PhaseName train_phase = is_test ? TEST : TRAIN;

  // The output accessors below are created with sigma's scalar type, so the
  // outputs must be allocated with sigma's dtype (and device) rather than the
  // default dtype.
  const auto options = sigma.options().requires_grad(false);
  auto gt_dist = -torch::ones({N, M}, options);
  auto pred_dist = -torch::ones({N, M}, options);
  auto coord_index = torch::zeros({N, M, 3}, options);

  // Nothing to render: avoid launching a grid with a zero-sized dimension.
  if (N == 0 || M == 0) {
    return {pred_dist, gt_dist, coord_index};
  }

  // one thread per ray along x, one grid row per batch element along y
  const int threads = 1024;
  const dim3 blocks((M + threads - 1) / threads, N);

  // perform rendering
  AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_forward_cuda", ([&] {
    hipLaunchKernelGGL(( render_forward_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
        sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
        origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        coord_index.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        train_phase);
  }));

  // wait for the kernel before handing the tensors back
  hipDeviceSynchronize();

  return {pred_dist, gt_dist, coord_index};
}
/*
 * Ray-marching kernel: for every ray it walks the voxel grid from the ray
 * origin, renders an expected ray distance from sigma, and accumulates the
 * gradient of the distance loss with respect to sigma.
 *
 * Thread mapping: blockIdx.y is the batch index n, and
 * blockIdx.x * blockDim.x + threadIdx.x is the ray index c. Threads with
 * c >= M do nothing.
 *
 * sigma      : read at [n][ts][z][y][x]
 * origin     : read at [n][t][0..2]
 * points     : read at [n][c][0..2]
 * tindex     : read at [n][c]; a negative value marks a padded ray
 * pred_dist  : written at [n][c] with the rendered distance
 * gt_dist    : written at [n][c] with the ground-truth distance capped at the
 *              distance at which the ray leaves the grid
 * grad_sigma : accumulated at every voxel the ray crossed
 * loss_type  : selects dl/dd (L1, L2 or ABSREL)
 *
 * NOTE(review): a ray whose end point equals its origin gives gt_d == 0 and
 * therefore a 0/0 direction; such rays are not filtered here - confirm the
 * caller never passes them.
 */
template <typename scalar_t>
__global__ void render_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
    // const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma,
    // torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma_count,
    LossType loss_type) {

    // batch index
    const auto n = blockIdx.y;

    // ray index
    const auto c = blockIdx.x * blockDim.x + threadIdx.x;

    // num of rays
    const auto M = points.size(1);
    const auto T = sigma.size(1);

    // we allocated more threads than num_rays
    if (c < M) {
        // time index of this ray
        const auto t = tindex[n][c];

        // invalid points
        // assert(t < T);
        assert(T == 1 || t < T);

        // time index for sigma
        // when T = 1, we have a static sigma
        const auto ts = (T == 1) ? 0 : t;

        // if t < 0, it is a padded point
        if (t < 0) return;

        // grid shape
        const int vzsize = sigma.size(2);
        const int vysize = sigma.size(3);
        const int vxsize = sigma.size(4);
        // assert(vzsize + vysize + vxsize <= MAX_D);

        // origin
        const double xo = origin[n][t][0];
        const double yo = origin[n][t][1];
        const double zo = origin[n][t][2];

        // end point
        const double xe = points[n][c][0];
        const double ye = points[n][c][1];
        const double ze = points[n][c][2];

        // locate the voxel where the origin resides
        const int vxo = int(xo);
        const int vyo = int(yo);
        const int vzo = int(zo);

        // current voxel, starts at the origin voxel
        int vx = vxo;
        int vy = vyo;
        int vz = vzo;

        // origin to end
        const double rx = xe - xo;
        const double ry = ye - yo;
        const double rz = ze - zo;
        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);

        // directional vector
        const double dx = rx / gt_d;
        const double dy = ry / gt_d;
        const double dz = rz / gt_d;

        // In which direction the voxel ids are incremented.
        const int stepX = (dx >= 0) ? 1 : -1;
        const int stepY = (dy >= 0) ? 1 : -1;
        const int stepZ = (dz >= 0) ? 1 : -1;

        // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);

        // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
        // the value of t at which the ray crosses the first vertical voxel boundary
        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //

        // tDeltaX, tDeltaY, tDeltaZ --
        // how far along the ray we must move for the horizontal component to equal the width of a voxel
        // the direction in which we traverse the grid
        // can only be DBL_MAX if we never go in that direction
        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;

        // per-ray scratch buffers, one slot per voxel visited inside the grid
        int3 path[MAX_D];
        double csd[MAX_D];  // cumulative sum of sigma times delta
        double p[MAX_D];    // alpha
        double d[MAX_D];    // distance at which the ray leaves the voxel
        double dt[MAX_D];   // distance travelled inside the voxel

        // forward raymarching with voxel traversal
        int step = 0;  // total number of voxels traversed
        int count = 0; // number of voxels traversed inside the voxel grid
        double last_d = 0.0; // correct initialization

        // voxel traversal raycasting
        bool was_inside = false;
        while (true) {
            bool inside = (0 <= vx && vx < vxsize) &&
                (0 <= vy && vy < vysize) &&
                (0 <= vz && vz < vzsize);
            if (inside) { // now inside
                // never write past the scratch buffers; the assert after the
                // loop would otherwise fire only after the overflow happened
                if (count >= MAX_D) break;
                was_inside = true;
                path[count] = make_int3(vx, vy, vz);
            } else if (was_inside) { // was inside but no longer
                // we know we are not coming back so terminate
                break;
            } else if (last_d > gt_d) {
                // passed the end point without ever entering the grid
                break;
            }

            // _d represents the ray distance has traveled before escaping the current voxel cell
            double _d = 0.0;
            // voxel traversal: advance along the axis whose boundary is hit first
            if (tMaxX < tMaxY) {
                if (tMaxX < tMaxZ) {
                    _d = tMaxX;
                    vx += stepX;
                    tMaxX += tDeltaX;
                } else {
                    _d = tMaxZ;
                    vz += stepZ;
                    tMaxZ += tDeltaZ;
                }
            } else {
                if (tMaxY < tMaxZ) {
                    _d = tMaxY;
                    vy += stepY;
                    tMaxY += tDeltaY;
                } else {
                    _d = tMaxZ;
                    vz += stepZ;
                    tMaxZ += tDeltaZ;
                }
            }

            if (inside) {
                // get sigma at the current voxel
                const int3 &v = path[count];  // use the recorded index
                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
                const double sd = _sigma * _delta;
                if (count == 0) { // the first voxel inside
                    csd[count] = sd;
                    p[count] = 1 - exp(-sd);
                } else {
                    csd[count] = csd[count-1] + sd;
                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
                }
                // record the traveled distance
                d[count] = _d;
                dt[count] = _delta;
                // count the number of voxels we have escaped
                count ++;
            }
            last_d = _d;
            step ++;

            if (step > MAX_STEP) {
                break;
            }
        }

        // the total number of voxels visited should not exceed this number
        assert(count <= MAX_D);

        // WHEN THERE IS AN INTERSECTION BETWEEN THE RAY AND THE VOXEL GRID
        if (count > 0) {
            // compute the expected ray distance
            double exp_d = 0.0;
            for (int i = 0; i < count; i ++)
                exp_d += p[i] * d[i];

            // add an imaginary sample at the end point should gt_d exceeds max_d
            // p_out is the probability the ray escapes the voxel grid
            double p_out = exp(-csd[count-1]);
            double max_d = d[count-1];

            exp_d += (p_out * max_d);
            gt_d = min(gt_d, max_d);

            // write the rendered ray distance (max_d)
            pred_dist[n][c] = exp_d;
            gt_dist[n][c] = gt_d;

            /* backward raymarching */
            double dd_dsigma[MAX_D];
            for (int i = count - 1; i >= 0; i --) {
                // NOTE: probably need to double check again
                if (i == count - 1)
                    dd_dsigma[i] = p_out * max_d;
                else
                    dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
            }

            for (int i = count - 1; i >= 0; i --)
                dd_dsigma[i] *= dt[i];

            // option 2: cap at the boundary
            for (int i = count - 1; i >= 0; i --)
                dd_dsigma[i] -= dt[i] * p_out * max_d;

            double dl_dd = 1.0;
            if (loss_type == L1)
                dl_dd = (exp_d >= gt_d) ? 1 : -1;
            else if (loss_type == L2)
                dl_dd = (exp_d - gt_d);
            else if (loss_type == ABSREL)
                dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);

            // apply chain rule
            for (int i = 0; i < count; i ++) {
                const int3 &v = path[i];
                // Different rays (threads) can cross the same voxel, so the
                // accumulation must be atomic; a plain += can lose updates.
                atomicAdd(&grad_sigma[n][ts][v.z][v.y][v.x],
                          static_cast<scalar_t>(dl_dd * dd_dsigma[i]));
                // grad_sigma_count[n][ts][v.z][v.y][v.x] += 1;
            }
        }
    }
}
/*
 * Host-side launcher for render_cuda_kernel (forward rendering plus the
 * gradient of the distance loss with respect to sigma).
 *
 * input shape
 *   sigma  : N x T x H x L x W
 *   origin : N x T x 3
 *   points : N x M x 4
 *   tindex : N x M
 * other arguments
 *   loss_name : "l1", "l2", "absrel" or "bce" ("bce" is mapped to L1)
 * output (in this order)
 *   pred_dist  : N x M              (initialised to -1)
 *   gt_dist    : N x M              (initialised to -1)
 *   grad_sigma : N x T x H x L x W  (same shape as sigma, initialised to 0)
 *
 * Raises a c10::Error (via TORCH_CHECK) when loss_name is not recognised.
 */
std::vector<torch::Tensor> render_cuda(
    torch::Tensor sigma,
    torch::Tensor origin,
    torch::Tensor points,
    torch::Tensor tindex,
    std::string loss_name) {
  const auto N = points.size(0); // batch size
  const auto M = points.size(1); // num of rays

  // Translate the loss name; report an unknown name as an exception instead
  // of terminating the whole host process.
  LossType loss_type = L1;
  if (loss_name == "l1" || loss_name == "bce") {
    loss_type = L1;
  } else if (loss_name == "l2") {
    loss_type = L2;
  } else if (loss_name == "absrel") {
    loss_type = ABSREL;
  } else {
    TORCH_CHECK(false, "UNKNOWN LOSS TYPE: ", loss_name);
  }

  // The output accessors below are created with sigma's scalar type, so the
  // outputs must be allocated with sigma's dtype (and device) rather than the
  // default dtype.
  const auto options = sigma.options().requires_grad(false);
  auto gt_dist = -torch::ones({N, M}, options);
  auto pred_dist = -torch::ones({N, M}, options);
  auto grad_sigma = torch::zeros_like(sigma);
  // auto grad_sigma_count = torch::zeros_like(sigma);

  // Nothing to render: avoid launching a grid with a zero-sized dimension.
  if (N == 0 || M == 0) {
    return {pred_dist, gt_dist, grad_sigma};
  }

  // one thread per ray along x, one grid row per batch element along y
  const int threads = 1024;
  const dim3 blocks((M + threads - 1) / threads, N);

  // perform rendering
  AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_cuda", ([&] {
    hipLaunchKernelGGL(( render_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
        sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
        origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        // occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
        pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        grad_sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
        // grad_sigma_count.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
        loss_type);
  }));

  // wait for the kernel before handing the tensors back
  hipDeviceSynchronize();

  // grad_sigma_count += (grad_sigma_count == 0);
  // grad_sigma /= grad_sigma_count;

  return {pred_dist, gt_dist, grad_sigma};
}
/*
 * Host-side launcher for init_cuda_kernel: allocates a zero occupancy grid
 * and lets the kernel fill it from the points.
 *
 * input shape
 *   points : N x M x 3
 *   tindex : N x M
 *   grid   : at least four entries, read as {T, H, L, W}
 * output shape
 *   occupancy : N x T x H x L x W  (dtype and device of points)
 *
 * Raises a c10::Error (via TORCH_CHECK) when grid has fewer than four entries.
 */
torch::Tensor init_cuda(
    torch::Tensor points,
    torch::Tensor tindex,
    const std::vector<int> grid) {
  // grid is indexed below; reject short vectors instead of reading past the end
  TORCH_CHECK(grid.size() >= 4,
              "init_cuda expects grid = {T, H, L, W}, got ", grid.size(), " entries");

  const auto N = points.size(0); // batch size
  const auto M = points.size(1); // num of rays

  const auto T = grid[0];
  const auto H = grid[1];
  const auto L = grid[2];
  const auto W = grid[3];

  const auto dtype = points.dtype();
  const auto device = points.device();
  const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
  auto occupancy = torch::zeros({N, T, H, L, W}, options);

  // Nothing to scatter: avoid launching a grid with a zero-sized dimension.
  if (N == 0 || M == 0) {
    return occupancy;
  }

  // one thread per point along x, one grid row per batch element along y
  const int threads = 1024;
  const dim3 blocks((M + threads - 1) / threads, N);

  // initialize occupancy such that every voxel with one or more points is occupied
  // NOTE(review): tindex is accessed with the scalar type of points - confirm
  // the caller passes both tensors with the same floating dtype.
  AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "init_cuda", ([&] {
    hipLaunchKernelGGL(( init_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
        points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>());
  }));

  // synchronize
  hipDeviceSynchronize();

  return occupancy;
}
\ No newline at end of file
mmdetection3d
@
c9541b0d
Subproject commit c9541b0db89498fdea5cafd05b7b17f7b625b858
docker-hub/FlashOCC/Flashocc/projects/__init__.py
0 → 100644
View file @
d2b71343
docker-hub/FlashOCC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
0 → 100644
View file @
d2b71343
File added
docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
0 → 100644
View file @
d2b71343
# mmdet3d-style config: BEVStereo4DOCC occupancy model with a ResNet-50 image
# backbone, inheriting the nuScenes dataset and default runtime settings.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 0.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 32

# (start, stop, step) passed to range() to enumerate adjacent frame ids
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVStereo4DOCC',
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch'),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=False,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(use_dcn=False,
                          aspp_mid_channels=96,
                          stereo=True,
                          bias=5.),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet3D',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_layer=[1, 2, 4],
        with_cp=False,
        num_channels=[numC_Trans, numC_Trans * 2, numC_Trans * 4],
        stride=[1, 2, 2],
        backbone_output_ids=[0, 1, 2]),
    img_bev_encoder_neck=dict(type='LSSFPN3D',
                              in_channels=numC_Trans * 7,
                              out_channels=numC_Trans),
    pre_process=dict(
        type='CustomResNet3D',
        numC_input=numC_Trans,
        with_cp=False,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    occ_head=dict(
        type='BEVOCCHead3D',
        in_dim=numC_Trans,
        out_dim=32,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# settings merged into every split (train / val / test) below
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

load_from = "ckpts/bevdet-r50-4d-stereo-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with_pretrain:
# align_after_view_transfromation=False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 8.22
# ===> barrier - IoU = 44.21
# ===> bicycle - IoU = 10.34
# ===> bus - IoU = 42.08
# ===> car - IoU = 49.63
# ===> construction_vehicle - IoU = 23.37
# ===> motorcycle - IoU = 17.41
# ===> pedestrian - IoU = 21.49
# ===> traffic_cone - IoU = 19.7
# ===> trailer - IoU = 31.33
# ===> truck - IoU = 37.09
# ===> driveable_surface - IoU = 80.13
# ===> other_flat - IoU = 37.37
# ===> sidewalk - IoU = 50.41
# ===> terrain - IoU = 54.29
# ===> manmade - IoU = 45.56
# ===> vegetation - IoU = 39.59
# ===> mIoU of 6019 samples: 36.01
docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
0 → 100644
View file @
d2b71343
# mmdet3d-style config: BEVDetOCC occupancy model with a ResNet-50 image
# backbone, inheriting the nuScenes dataset and default runtime settings.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 0.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 32

model = dict(
    type='BEVDetOCC',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformer',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=False,
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet3D',
        numC_input=numC_Trans,
        num_layer=[1, 2, 4],
        with_cp=False,
        num_channels=[numC_Trans, numC_Trans * 2, numC_Trans * 4],
        stride=[1, 2, 2],
        backbone_output_ids=[0, 1, 2]),
    img_bev_encoder_neck=dict(type='LSSFPN3D',
                              in_channels=numC_Trans * 7,
                              out_channels=numC_Trans),
    occ_head=dict(
        type='BEVOCCHead3D',
        in_dim=numC_Trans,
        out_dim=32,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# settings merged into every split (train / val / test) below
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

load_from = "ckpts/bevdet-r50-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with pretrain
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 6.65
# ===> barrier - IoU = 36.97
# ===> bicycle - IoU = 8.33
# ===> bus - IoU = 38.69
# ===> car - IoU = 44.46
# ===> construction_vehicle - IoU = 15.21
# ===> motorcycle - IoU = 13.67
# ===> pedestrian - IoU = 16.39
# ===> traffic_cone - IoU = 15.27
# ===> trailer - IoU = 27.11
# ===> truck - IoU = 31.04
# ===> driveable_surface - IoU = 78.7
# ===> other_flat - IoU = 36.45
# ===> sidewalk - IoU = 48.27
# ===> terrain - IoU = 51.68
# ===> manmade - IoU = 36.82
# ===> vegetation - IoU = 32.09
# ===> mIoU of 6019 samples: 31.64
# with det pretrain; use_mask=False; class_balance=True
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 4.36
# ===> barrier - IoU = 28.87
# ===> bicycle - IoU = 2.86
# ===> bus - IoU = 29.27
# ===> car - IoU = 32.45
# ===> construction_vehicle - IoU = 11.05
# ===> motorcycle - IoU = 12.82
# ===> pedestrian - IoU = 10.11
# ===> traffic_cone - IoU = 9.47
# ===> trailer - IoU = 7.93
# ===> truck - IoU = 21.58
# ===> driveable_surface - IoU = 49.85
# ===> other_flat - IoU = 25.5
# ===> sidewalk - IoU = 26.78
# ===> terrain - IoU = 21.14
# ===> manmade - IoU = 5.76
# ===> vegetation - IoU = 7.09
# ===> mIoU of 6019 samples: 18.05
\ No newline at end of file
docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
0 → 100644
View file @
d2b71343
# Copyright (c) Phigent Robotics. All rights reserved.
# align_after_view_transfromation=True
# align_after_view_transfromation=False
# 1x/12epoch
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.12
# ===> barrier - IoU = 48.06
# ===> bicycle - IoU = 0.0
# ===> bus - IoU = 51.19
# ===> car - IoU = 53.61
# ===> construction_vehicle - IoU = 27.15
# ===> motorcycle - IoU = 2.74
# ===> pedestrian - IoU = 28.3
# ===> traffic_cone - IoU = 23.33
# ===> trailer - IoU = 36.24
# ===> truck - IoU = 42.13
# ===> driveable_surface - IoU = 81.77
# ===> other_flat - IoU = 42.43
# ===> sidewalk - IoU = 53.67
# ===> terrain - IoU = 57.31
# ===> manmade - IoU = 48.27
# ===> vegetation - IoU = 43.31
# ===> mIoU of 6019 samples: 38.21
# 2x/24epoch
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 12.15
# ===> barrier - IoU = 49.63
# ===> bicycle - IoU = 25.1
# ===> bus - IoU = 52.02
# ===> car - IoU = 54.46
# ===> construction_vehicle - IoU = 27.87
# ===> motorcycle - IoU = 27.99
# ===> pedestrian - IoU = 28.94
# ===> traffic_cone - IoU = 27.23
# ===> trailer - IoU = 36.43
# ===> truck - IoU = 42.22
# ===> driveable_surface - IoU = 82.31
# ===> other_flat - IoU = 43.29
# ===> sidewalk - IoU = 54.62
# ===> terrain - IoU = 57.9
# ===> manmade - IoU = 48.61
# ===> vegetation - IoU = 43.55
# ===> mIoU of 6019 samples: 42.02
# 3x/36epoch
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 12.37
# ===> barrier - IoU = 50.15
# ===> bicycle - IoU = 26.97
# ===> bus - IoU = 51.86
# ===> car - IoU = 54.65
# ===> construction_vehicle - IoU = 28.38
# ===> motorcycle - IoU = 28.96
# ===> pedestrian - IoU = 29.02
# ===> traffic_cone - IoU = 28.28
# ===> trailer - IoU = 37.05
# ===> truck - IoU = 42.52
# ===> driveable_surface - IoU = 82.55
# ===> other_flat - IoU = 43.15
# ===> sidewalk - IoU = 54.87
# ===> terrain - IoU = 58.33
# ===> manmade - IoU = 48.78
# ===> vegetation - IoU = 43.79
# ===> mIoU of 6019 samples: 42.45
# Base configs inherited by this file.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

# Plugin switch and the directory holding the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation parameters.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (512, 1408),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# Model
# NOTE(review): each entry looks like [lower bound, upper bound, step] --
# confirm against the view transformer implementation.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 0.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 32

# Unpacked into range() below: range(1, 2, 1) yields a single adjacent frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVStereo4DOCC',
    # Keyword spelling is intentional; it has to match the model's argument.
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        patch_size=4,
        window_size=12,
        mlp_ratio=4,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        strides=(4, 2, 2, 2),
        out_indices=(2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        use_abs_pos_embed=False,
        return_stereo_feat=True,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN', requires_grad=True),
        pretrain_style='official',
        output_missing_index_as_none=False,
    ),
    img_neck=dict(
        type='FPN_LSS',
        in_channels=512 + 1024,
        out_channels=512,
        # with_cp=False,
        extra_upsample=None,
        input_feature_index=(0, 1),
        scale_factor=2,
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=False,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(
            use_dcn=False,
            aspp_mid_channels=96,
            stereo=True,
            bias=5.0,
        ),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet3D',
        # numC_Trans channels for each adjacent frame plus one more set.
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_layer=[1, 2, 4],
        with_cp=False,
        num_channels=[numC_Trans, numC_Trans * 2, numC_Trans * 4],
        stride=[1, 2, 2],
        backbone_output_ids=[0, 1, 2],
    ),
    img_bev_encoder_neck=dict(
        type='LSSFPN3D',
        # 7 = 1 + 2 + 4, the sum of the backbone's channel multipliers above.
        in_channels=numC_Trans * 7,
        out_channels=numC_Trans,
    ),
    pre_process=dict(
        type='CustomResNet3D',
        numC_input=numC_Trans,
        with_cp=False,
        num_layer=[1],
        num_channels=[numC_Trans],
        stride=[1],
        backbone_output_ids=[0],
    ),
    occ_head=dict(
        type='BEVOCCHead3D',
        in_dim=numC_Trans,
        out_dim=32,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0,
        ),
    ),
)

# Data
# Registry name spelling is intentional; it has to match the dataset class.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation and scaling ranges are degenerate here,
# only the two flip ratios are non-trivial.
bda_aug_conf = dict(
    rot_lim=(-0.0, 0.0),
    scale_lim=(1.0, 1.0),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5,
)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera']),
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs']),
        ]),
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Settings merged into every dataset split by the loop further down.
share_data_config = dict(
    type=dataset_type,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl',
)

data = dict(
    samples_per_gpu=1,  # with 32 GPU
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    # val and test deliberately reference the very same dict object.
    val=test_data_config,
    test=test_data_config,
)

# The loop variable stays in the module namespace after the loop, so its
# name and the iteration order are kept exactly as they were.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24],
)
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
    dict(
        type='SyncbnControlHook',
        syncbn_start_epoch=0,
    ),
]

# Checkpoint used to initialise the weights.
load_from = "ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
0 → 100644
View file @
d2b71343
# Base configs inherited by this file.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

# Plugin switch and the directory holding the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation parameters.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# NOTE(review): each entry looks like [lower bound, upper bound, step]; with
# that reading the z axis (step 6.4 over [-1, 5.4]) is a single bin -- confirm.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80

# Unpacked into range() below: range(1, 2, 1) yields a single adjacent frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVStereo4DOCC',
    # Keyword spelling is intentional; it has to match the model's argument.
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        pretrained='torchvision://resnet50',
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0],
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=True,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(
            use_dcn=False,
            aspp_mid_channels=96,
            stereo=True,
            bias=5.0,
        ),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        # numC_Trans channels for each adjacent frame plus one more set.
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256,
    ),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1],
        num_channels=[numC_Trans],
        stride=[1],
        backbone_output_ids=[0],
    ),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0,
        ),
    ),
)

# Data
# Registry name spelling is intentional; it has to match the dataset class.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation and scaling ranges are degenerate here,
# only the two flip ratios are non-trivial.
bda_aug_conf = dict(
    rot_lim=(-0.0, 0.0),
    scale_lim=(1.0, 1.0),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5,
)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera']),
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs']),
        ]),
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Settings merged into every dataset split by the loop further down.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl',
)

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    # val and test deliberately reference the very same dict object.
    val=test_data_config,
    test=test_data_config,
)

# The loop variable stays in the module namespace after the loop, so its
# name and the iteration order are kept exactly as they were.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24],
)
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

# Checkpoint used to initialise the weights.
load_from = "./ckpts/bevdet-r50-4d-stereo-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with_pretrain:
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 9.08
# ===> barrier - IoU = 46.32
# ===> bicycle - IoU = 17.71
# ===> bus - IoU = 42.7
# ===> car - IoU = 50.64
# ===> construction_vehicle - IoU = 23.72
# ===> motorcycle - IoU = 20.13
# ===> pedestrian - IoU = 22.34
# ===> traffic_cone - IoU = 24.09
# ===> trailer - IoU = 30.26
# ===> truck - IoU = 37.39
# ===> driveable_surface - IoU = 81.68
# ===> other_flat - IoU = 40.13
# ===> sidewalk - IoU = 52.34
# ===> terrain - IoU = 56.46
# ===> manmade - IoU = 47.69
# ===> vegetation - IoU = 40.6
# ===> mIoU of 6019 samples: 37.84
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
0 → 100644
View file @
d2b71343
# Variant of the M0 config: inherits everything from it and only overrides
# the two model switches below.
_base_ = [
    './flashocc-r50-M0.py',
]

model = dict(
    wocc=True,
    wdet3d=False,
)
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
0 → 100644
View file @
d2b71343
# Base configs inherited by this file.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

# Plugin switch and the directory holding the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation parameters.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# NOTE(review): each entry looks like [lower bound, upper bound, step] --
# confirm against the view transformer implementation.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 1.0],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 64

model = dict(
    type='BEVDetOCC',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0],
    ),
    img_view_transformer=dict(
        type='LSSViewTransformer',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=128,
    ),
    occ_head=dict(
        type='BEVOCCHead2D',
        # in_dim equals the BEV neck's out_channels above.
        in_dim=128,
        out_dim=128,
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0,
        ),
    ),
)

# Data
# Registry name spelling is intentional; it has to match the dataset class.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation and scaling ranges are degenerate here,
# only the two flip ratios are non-trivial.
bda_aug_conf = dict(
    rot_lim=(-0.0, 0.0),
    scale_lim=(1.0, 1.0),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5,
)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera']),
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs']),
        ]),
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Settings merged into every dataset split by the loop further down.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl',
)

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    # val and test deliberately reference the very same dict object.
    val=test_data_config,
    test=test_data_config,
)

# The loop variable stays in the module namespace after the loop, so its
# name and the iteration order are kept exactly as they were.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24],
)
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

# Checkpoint used to initialise the weights.
load_from = "ckpts/bevdet-r50-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with det pretrain; use_mask=True; out_dim=256 (NOTE(review): occ_head.out_dim is 128 in this config -- confirm which value produced the numbers below)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 6.21
# ===> barrier - IoU = 39.56
# ===> bicycle - IoU = 11.27
# ===> bus - IoU = 36.31
# ===> car - IoU = 43.96
# ===> construction_vehicle - IoU = 16.25
# ===> motorcycle - IoU = 14.74
# ===> pedestrian - IoU = 16.89
# ===> traffic_cone - IoU = 15.76
# ===> trailer - IoU = 28.56
# ===> truck - IoU = 30.91
# ===> driveable_surface - IoU = 78.16
# ===> other_flat - IoU = 37.52
# ===> sidewalk - IoU = 47.42
# ===> terrain - IoU = 51.35
# ===> manmade - IoU = 36.79
# ===> vegetation - IoU = 31.42
# ===> mIoU of 6019 samples: 31.95
# {'mIoU': array([0.06207982, 0.39564533, 0.11270112, 0.36311426, 0.43955401,
# 0.16252583, 0.14739984, 0.16885096, 0.15757262, 0.28564777,
# 0.30909029, 0.7815907 , 0.37523904, 0.47420705, 0.51351759,
# 0.36789645, 0.31420157, 0.87802724])}
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
0 → 100644
View file @
d2b71343
# Variant of the flashocc-r50 config: inherits everything from it and only
# overrides the two model switches below.
_base_ = [
    './flashocc-r50.py',
]

model = dict(
    wocc=True,
    wdet3d=False,
)
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
0 → 100644
View file @
d2b71343
# Base configs inherited by this file.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

# Plugin switch and the directory holding the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation parameters.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# NOTE(review): each entry looks like [lower bound, upper bound, step] --
# confirm against the view transformer implementation.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 64

model = dict(
    type='BEVDetOCC',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        # pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0],
    ),
    img_view_transformer=dict(
        type='LSSViewTransformer',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256,
    ),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,  # out_dim=128 for M0!!!
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0,
        ),
    ),
)

# Data
# Registry name spelling is intentional; it has to match the dataset class.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation and scaling ranges are degenerate here,
# only the two flip ratios are non-trivial.
bda_aug_conf = dict(
    rot_lim=(-0.0, 0.0),
    scale_lim=(1.0, 1.0),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5,
)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera']),
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs']),
        ]),
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Settings merged into every dataset split by the loop further down.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl',
)

data = dict(
    samples_per_gpu=24,
    workers_per_gpu=24,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    # val and test deliberately reference the very same dict object.
    val=test_data_config,
    test=test_data_config,
)

# The loop variable stays in the module namespace after the loop, so its
# name and the iteration order are kept exactly as they were.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24],
)
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

# Checkpoint used to initialise the weights.
load_from = "ckpts/bevdet-r50-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with det pretrain; use_mask=True;
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 6.74
# ===> barrier - IoU = 37.65
# ===> bicycle - IoU = 10.26
# ===> bus - IoU = 39.55
# ===> car - IoU = 44.36
# ===> construction_vehicle - IoU = 14.88
# ===> motorcycle - IoU = 13.4
# ===> pedestrian - IoU = 15.79
# ===> traffic_cone - IoU = 15.38
# ===> trailer - IoU = 27.44
# ===> truck - IoU = 31.73
# ===> driveable_surface - IoU = 78.82
# ===> other_flat - IoU = 37.98
# ===> sidewalk - IoU = 48.7
# ===> terrain - IoU = 52.5
# ===> manmade - IoU = 37.89
# ===> vegetation - IoU = 32.24
# ===> mIoU of 6019 samples: 32.08
# with det pretrain; use_mask=False; class_balance=True
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 4.49
# ===> barrier - IoU = 29.59
# ===> bicycle - IoU = 7.38
# ===> bus - IoU = 30.32
# ===> car - IoU = 32.22
# ===> construction_vehicle - IoU = 13.04
# ===> motorcycle - IoU = 11.91
# ===> pedestrian - IoU = 8.61
# ===> traffic_cone - IoU = 8.11
# ===> trailer - IoU = 7.66
# ===> truck - IoU = 20.84
# ===> driveable_surface - IoU = 48.59
# ===> other_flat - IoU = 26.62
# ===> sidewalk - IoU = 26.08
# ===> terrain - IoU = 20.86
# ===> manmade - IoU = 7.62
# ===> vegetation - IoU = 7.14
# ===> mIoU of 6019 samples: 18.3
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
0 → 100644
View file @
d2b71343
# Base configs inherited by this file.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

# Plugin switch and the directory holding the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation parameters.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (512, 1408),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# Model
# NOTE(review): each entry looks like [lower bound, upper bound, step] --
# confirm against the view transformer implementation.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80

# Unpacked into range() below: range(1, 2, 1) yields a single adjacent frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVStereo4DOCC',
    # Keyword spelling is intentional; it has to match the model's argument.
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        patch_size=4,
        window_size=12,
        mlp_ratio=4,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        strides=(4, 2, 2, 2),
        out_indices=(2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        use_abs_pos_embed=False,
        return_stereo_feat=True,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN', requires_grad=True),
        pretrain_style='official',
        output_missing_index_as_none=False,
    ),
    img_neck=dict(
        type='FPN_LSS',
        in_channels=512 + 1024,
        out_channels=512,
        # with_cp=False,
        extra_upsample=None,
        input_feature_index=(0, 1),
        scale_factor=2,
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(
            use_dcn=False,
            aspp_mid_channels=96,
            stereo=True,
            bias=5.0,
        ),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        with_cp=True,
        # numC_Trans channels for each adjacent frame plus one more set.
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256,
    ),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1],
        num_channels=[numC_Trans],
        stride=[1],
        backbone_output_ids=[0],
    ),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0,
        ),
    ),
)

# Data
# Registry name spelling is intentional; it has to match the dataset class.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation and scaling ranges are degenerate here,
# only the two flip ratios are non-trivial.
bda_aug_conf = dict(
    rot_lim=(-0.0, 0.0),
    scale_lim=(1.0, 1.0),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5,
)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera']),
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs']),
        ]),
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Settings merged into every dataset split by the loop further down.
share_data_config = dict(
    type=dataset_type,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    data_root=data_root,
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl',
)

data = dict(
    # Original note here read "with 32 GPU".
    # NOTE(review): the file name says 4x4 and the launch command at the end
    # of this file passes 4 -- confirm the intended GPU count.
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    # val and test deliberately reference the very same dict object.
    val=test_data_config,
    test=test_data_config,
)

# The loop variable stays in the module namespace after the loop, so its
# name and the iteration order are kept exactly as they were.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24],
)
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
    dict(
        type='SyncbnControlHook',
        syncbn_start_epoch=0,
    ),
]

evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=3)
# Initialise from the pretrained checkpoint by default.
load_from = "ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
# The committed config used to resume from a machine-local file,
# "work_dirs/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2/epoch_5.pth",
# which does not exist on a fresh checkout. To continue an interrupted run,
# set that path here locally (or use the training script's resume option, if
# it provides one) instead of committing it.
resume_from = None
# fp16 = dict(loss_scale='dynamic')
# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py 4
\ No newline at end of file
docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
0 → 100644
View file @
d2b71343
# Base configs inherited by this file.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

# Plugin switch and the directory holding the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation parameters.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (512, 1408),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# Model
# NOTE(review): each entry looks like [lower bound, upper bound, step] --
# confirm against the view transformer implementation.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80

# Passed as range(*multi_adj_frame_id_cfg) by the model section that follows;
# range(1, 2, 1) yields a single adjacent frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)
# Model definition. Values that reference grid_config, data_config,
# numC_Trans and multi_adj_frame_id_cfg are computed from the constants
# defined earlier in this file.
model = dict(
    type='BEVStereo4DOCC',
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        patch_size=4,
        window_size=12,
        mlp_ratio=4,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        strides=(4, 2, 2, 2),
        out_indices=(2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.1,
        use_abs_pos_embed=False,
        return_stereo_feat=True,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN', requires_grad=True),
        pretrain_style='official',
        output_missing_index_as_none=False,
    ),
    img_neck=dict(
        type='FPN_LSS',
        in_channels=512 + 1024,
        out_channels=512,
        # with_cp=False,
        extra_upsample=None,
        input_feature_index=(0, 1),
        scale_factor=2,
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(
            use_dcn=False,
            aspp_mid_channels=96,
            stereo=True,
            bias=5.,
        ),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        with_cp=True,
        # numC_Trans times (number of adjacent frames + 1).
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256,
    ),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1],
        num_channels=[numC_Trans],
        stride=[1],
        backbone_output_ids=[0],
    ),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_wise=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0,
        ),
    ),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = {'backend': 'disk'}

# BEV-space augmentation: rotation and scale limits are degenerate ranges,
# both flip ratios are 0.5.
bda_aug_conf = {
    'rot_lim': (-0., 0.),
    'scale_lim': (1., 1.),
    'flip_dx_ratio': 0.5,
    'flip_dy_ratio': 0.5,
}
# Training pipeline: image inputs, annotations, occupancy ground truth,
# lidar points and point-derived depth maps, then formatting and collection.
train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera']),
]

# Test-time pipeline: no occupancy ground truth or depth maps are loaded and
# only 'points' and 'img_inputs' are collected.
test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs']),
        ]),
]
# Only the camera modality is enabled.
input_modality = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': False,
}

# Settings merged into every split by the loop at the end of this section.
share_data_config = {
    'type': dataset_type,
    'classes': class_names,
    'modality': input_modality,
    'stereo': True,
    'filter_empty_gt': False,
    'img_info_prototype': 'bevdet4d',
    'multi_adj_frame_id_cfg': multi_adj_frame_id_cfg,
}

# val and test reference this same dict object.
test_data_config = {
    'data_root': data_root,
    'pipeline': test_pipeline,
    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
}

data = {
    # Per-GPU batch size. NOTE(review): the original remark here read
    # "with 32 GPU", but the launch command at the end of this file uses
    # 4 GPUs -- confirm which setup the schedule was tuned for.
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'data_root': data_root,
        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        'pipeline': train_pipeline,
        'classes': class_names,
        'test_mode': False,
        'use_valid_flag': True,
        # box_type_3d is 'LiDAR' for the kitti and nuscenes datasets and
        # 'Depth' for the sunrgbd and scannet datasets.
        'box_type_3d': 'LiDAR',
    },
    'val': test_data_config,
    'test': test_data_config,
}

# Merge the shared settings into each split's dict in place.
for key in ('val', 'train', 'test'):
    data[key].update(share_data_config)
# Optimizer
optimizer = {'type': 'AdamW', 'lr': 2e-4, 'weight_decay': 1e-2}
optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}

# Step policy with a 200-iteration linear warmup (warmup_ratio 0.001).
# The single step value, 24, equals runner['max_epochs'] below.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 200,
    'warmup_ratio': 0.001,
    'step': [24],
}
runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
custom_hooks = [
    {'type': 'MEGVIIEMAHook', 'init_updates': 10560, 'priority': 'NORMAL'},
    {'type': 'SyncbnControlHook', 'syncbn_start_epoch': 0},
]

# Evaluation reuses the test-time pipeline.
evaluation = {'interval': 6, 'start': 0, 'pipeline': test_pipeline}
checkpoint_config = {'interval': 1, 'max_keep_ckpts': 3}

# Checkpoint used to initialise the weights.
load_from = "ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py 4
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 13.42
# ===> barrier - IoU = 51.07
# ===> bicycle - IoU = 27.68
# ===> bus - IoU = 51.57
# ===> car - IoU = 56.22
# ===> construction_vehicle - IoU = 27.27
# ===> motorcycle - IoU = 29.98
# ===> pedestrian - IoU = 29.93
# ===> traffic_cone - IoU = 29.8
# ===> trailer - IoU = 37.77
# ===> truck - IoU = 43.52
# ===> driveable_surface - IoU = 83.81
# ===> other_flat - IoU = 46.55
# ===> sidewalk - IoU = 56.15
# ===> terrain - IoU = 59.56
# ===> manmade - IoU = 50.84
# ===> vegetation - IoU = 44.67
# ===> mIoU of 6019 samples: 43.52
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 13.31
# ===> barrier - IoU = 51.62
# ===> bicycle - IoU = 28.07
# ===> bus - IoU = 50.91
# ===> car - IoU = 55.69
# ===> construction_vehicle - IoU = 27.46
# ===> motorcycle - IoU = 31.05
# ===> pedestrian - IoU = 29.98
# ===> traffic_cone - IoU = 29.2
# ===> trailer - IoU = 38.86
# ===> truck - IoU = 43.68
# ===> driveable_surface - IoU = 83.87
# ===> other_flat - IoU = 45.63
# ===> sidewalk - IoU = 56.33
# ===> terrain - IoU = 59.01
# ===> manmade - IoU = 50.63
# ===> vegetation - IoU = 44.56
# ===> mIoU of 6019 samples: 43.52
# {'mIoU': array([0.13311691, 0.51617081, 0.28070517, 0.50911942, 0.55694228,
# 0.27461342, 0.31050779, 0.29979125, 0.29204287, 0.38862984,
# 0.43680049, 0.83872518, 0.45630227, 0.56327839, 0.59008883,
# 0.50627122, 0.44564523, 0.90959399])}
\ No newline at end of file
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
0 → 100644
View file @
d2b71343
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation ranges.
data_config = dict(
    cams=[
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    Ncams=6,
    input_size=(256, 704),
    src_size=(900, 1600),
    # Augmentation
    resize=(-0.06, 0.11),
    rot=(-5.4, 5.4),
    flip=True,
    crop_h=(0.0, 0.0),
    resize_test=0.00,
)

grid_config = dict(
    x=[-40, 40, 0.4],
    y=[-40, 40, 0.4],
    z=[-1, 5.4, 6.4],
    depth=[1.0, 45.0, 0.5],
)

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80
# Model definition. Values that reference grid_config, data_config,
# numC_Trans, point_cloud_range and voxel_size are computed from the
# constants defined earlier in this file.
model = dict(
    type='BEVDepthPano',  # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0],
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256,
    ),
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=256,
        tasks=[
            dict(
                num_class=10,
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone',
                ]),
        ],
        common_heads=dict(
            reg=(2, 2),
            height=(1, 2),
            dim=(3, 2),
            rot=(2, 2),
            vel=(2, 2),
        ),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            # Only the first two entries of the range / voxel size are passed.
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.3,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9,
        ),
        separate_head=dict(
            type='SeparateHead',
            init_bias=-2.19,
            final_kernel=3,
        ),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True,
    ),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0,
        ),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
        ),
    ),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[
                [1.0, 0.7, 0.7, 0.4, 0.55, 1.1, 1.0, 1.0, 1.5, 3.5],
            ],
        ),
    ),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = {'backend': 'disk'}

# BEV-space augmentation: rotation and scale limits are degenerate ranges,
# both flip ratios are 0.5.
bda_aug_conf = {
    'rot_lim': (-0., 0.),
    'scale_lim': (1., 1.),
    'flip_dx_ratio': 0.5,
    'flip_dy_ratio': 0.5,
}
# Training pipeline: image inputs, annotations, occupancy ground truth,
# lidar points and point-derived depth maps, then formatting and collection
# (including the 3D boxes and labels).
train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test-time pipeline: no occupancy ground truth or depth maps are loaded.
test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='Collect3D',
                keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d']),
        ]),
]
# Only the camera modality is enabled.
input_modality = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': False,
}

# Settings merged into every split by the loop at the end of this section.
share_data_config = {
    'type': dataset_type,
    'data_root': data_root,
    'classes': class_names,
    'modality': input_modality,
    'stereo': False,
    'filter_empty_gt': False,
    'img_info_prototype': 'bevdet',
}

# val and test reference this same dict object.
test_data_config = {
    'pipeline': test_pipeline,
    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
}

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'data_root': data_root,
        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        'pipeline': train_pipeline,
        'classes': class_names,
        'test_mode': False,
        'use_valid_flag': True,
        # box_type_3d is 'LiDAR' for the kitti and nuscenes datasets and
        # 'Depth' for the sunrgbd and scannet datasets.
        'box_type_3d': 'LiDAR',
    },
    'val': test_data_config,
    'test': test_data_config,
}

# Merge the shared settings into each split's dict in place.
for key in ('val', 'train', 'test'):
    data[key].update(share_data_config)
# Optimizer
optimizer = {'type': 'AdamW', 'lr': 1e-4, 'weight_decay': 1e-2}
optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}

# Step policy with a 200-iteration linear warmup (warmup_ratio 0.001).
# The single step value, 24, equals runner['max_epochs'] below.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 200,
    'warmup_ratio': 0.001,
    'step': [24],
}
runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
custom_hooks = [
    {'type': 'MEGVIIEMAHook', 'init_updates': 10560, 'priority': 'NORMAL'},
]

# Checkpoint used to initialise the weights.
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')

# Evaluation reuses the test-time pipeline.
evaluation = {'interval': 1, 'start': 20, 'pipeline': test_pipeline}
checkpoint_config = {'interval': 1, 'max_keep_ckpts': 5}
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.21
# ===> barrier - IoU = 42.14
# ===> bicycle - IoU = 22.82
# ===> bus - IoU = 40.13
# ===> car - IoU = 42.86
# ===> construction_vehicle - IoU = 20.69
# ===> motorcycle - IoU = 24.58
# ===> pedestrian - IoU = 23.7
# ===> traffic_cone - IoU = 24.02
# ===> trailer - IoU = 25.48
# ===> truck - IoU = 30.9
# ===> driveable_surface - IoU = 58.65
# ===> other_flat - IoU = 32.04
# ===> sidewalk - IoU = 34.27
# ===> terrain - IoU = 31.12
# ===> manmade - IoU = 18.26
# ===> vegetation - IoU = 17.79
# ===> mIoU of 6019 samples: 29.39
# {'mIoU': array([0.102, 0.421, 0.228, 0.401, 0.429, 0.207, 0.246, 0.237, 0.24 ,
# 0.255, 0.309, 0.586, 0.32 , 0.343, 0.311, 0.183, 0.178, 0.833])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.090 | 0.102 | 0.105 |
# | barrier | 0.387 | 0.442 | 0.465 |
# | bicycle | 0.218 | 0.257 | 0.265 |
# | bus | 0.514 | 0.613 | 0.669 |
# | car | 0.487 | 0.564 | 0.592 |
# | construction_vehicle | 0.176 | 0.254 | 0.288 |
# | motorcycle | 0.203 | 0.292 | 0.310 |
# | pedestrian | 0.301 | 0.349 | 0.366 |
# | traffic_cone | 0.280 | 0.313 | 0.321 |
# | trailer | 0.227 | 0.313 | 0.390 |
# | truck | 0.395 | 0.493 | 0.537 |
# | driveable_surface | 0.534 | 0.618 | 0.708 |
# | other_flat | 0.289 | 0.326 | 0.356 |
# | sidewalk | 0.234 | 0.280 | 0.329 |
# | terrain | 0.222 | 0.291 | 0.356 |
# | manmade | 0.280 | 0.351 | 0.401 |
# | vegetation | 0.176 | 0.273 | 0.359 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.295 | 0.361 | 0.401 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.025 | 0.026 |
# | barrier | 0.125 | 0.182 | 0.218 |
# | bicycle | 0.051 | 0.072 | 0.076 |
# | bus | 0.275 | 0.366 | 0.422 |
# | car | 0.242 | 0.332 | 0.356 |
# | construction_vehicle | 0.016 | 0.058 | 0.092 |
# | motorcycle | 0.071 | 0.124 | 0.137 |
# | pedestrian | 0.017 | 0.022 | 0.023 |
# | traffic_cone | 0.032 | 0.040 | 0.044 |
# | trailer | 0.035 | 0.055 | 0.063 |
# | truck | 0.145 | 0.232 | 0.282 |
# | driveable_surface | 0.410 | 0.537 | 0.665 |
# | other_flat | 0.062 | 0.087 | 0.109 |
# | sidewalk | 0.008 | 0.030 | 0.064 |
# | terrain | 0.010 | 0.026 | 0.047 |
# | manmade | 0.054 | 0.091 | 0.134 |
# | vegetation | 0.003 | 0.022 | 0.092 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.092 | 0.135 | 0.168 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.35223182059688496, 'RayIoU@1': 0.29499743138394385, 'RayIoU@2': 0.3607063492639709, 'RayIoU@4': 0.4009916811427401,
# 'RayPQ': 0.13182524545677765, 'RayPQ@1': 0.09247682620339576, 'RayPQ@2': 0.1354024129684159, 'RayPQ@4': 0.16759649719852124}
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
0 → 100644
View file @
d2b71343
# Inherit everything from the tiny panoptic config that sits next to this
# file. The previous base path, './flashoccv2-r50-depth-tiny-pano.py', does
# not match the name the sibling config is committed under
# ('panoptic-flashocc-r50-depth-tiny-pano.py'), so it could not be resolved.
_base_ = [
    './panoptic-flashocc-r50-depth-tiny-pano.py',
]

# Only these two model flags are overridden on top of the base config.
# NOTE(review): presumably wocc / wdet3d switch the occupancy and the
# 3D-detection outputs on and off for the "-trt" variant -- confirm against
# the model class.
model = dict(
    wocc=True,
    wdet3d=False,
)
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
0 → 100644
View file @
d2b71343
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation ranges.
data_config = dict(
    cams=[
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    Ncams=6,
    input_size=(256, 704),
    src_size=(900, 1600),
    # Augmentation
    resize=(-0.06, 0.11),
    rot=(-5.4, 5.4),
    flip=True,
    crop_h=(0.0, 0.0),
    resize_test=0.00,
)

grid_config = dict(
    x=[-40, 40, 0.4],
    y=[-40, 40, 0.4],
    z=[-1, 5.4, 6.4],
    depth=[1.0, 45.0, 1.0],
)

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 64
# Model definition. Values that reference grid_config, data_config,
# numC_Trans, point_cloud_range and voxel_size are computed from the
# constants defined earlier in this file.
model = dict(
    type='BEVDepthPano',  # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0],
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=128,
    ),
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=128,
        tasks=[
            dict(
                num_class=10,
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone',
                ]),
        ],
        common_heads=dict(
            reg=(2, 2),
            height=(1, 2),
            dim=(3, 2),
            rot=(2, 2),
            vel=(2, 2),
        ),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            # Only the first two entries of the range / voxel size are passed.
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.3,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9,
        ),
        separate_head=dict(
            type='SeparateHead',
            init_bias=-2.19,
            final_kernel=3,
        ),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True,
    ),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=128,
        out_dim=128,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0,
        ),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
        ),
    ),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[
                [1.0, 0.7, 0.7, 0.4, 0.55, 1.1, 1.0, 1.0, 1.5, 3.5],
            ],
        ),
    ),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = {'backend': 'disk'}

# BEV-space augmentation: rotation and scale limits are degenerate ranges,
# both flip ratios are 0.5.
bda_aug_conf = {
    'rot_lim': (-0., 0.),
    'scale_lim': (1., 1.),
    'flip_dx_ratio': 0.5,
    'flip_dy_ratio': 0.5,
}
# Training pipeline: image inputs, annotations, occupancy ground truth,
# lidar points and point-derived depth maps, then formatting and collection
# (including the 3D boxes and labels).
train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test-time pipeline: no occupancy ground truth or depth maps are loaded.
test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='Collect3D',
                keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d']),
        ]),
]
# Only the camera modality is enabled.
input_modality = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': False,
}

# Settings merged into every split by the loop at the end of this section.
share_data_config = {
    'type': dataset_type,
    'data_root': data_root,
    'classes': class_names,
    'modality': input_modality,
    'stereo': False,
    'filter_empty_gt': False,
    'img_info_prototype': 'bevdet',
}

# val and test reference this same dict object.
test_data_config = {
    'pipeline': test_pipeline,
    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
}

data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'data_root': data_root,
        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        'pipeline': train_pipeline,
        'classes': class_names,
        'test_mode': False,
        'use_valid_flag': True,
        # box_type_3d is 'LiDAR' for the kitti and nuscenes datasets and
        # 'Depth' for the sunrgbd and scannet datasets.
        'box_type_3d': 'LiDAR',
    },
    'val': test_data_config,
    'test': test_data_config,
}

# Merge the shared settings into each split's dict in place.
for key in ('val', 'train', 'test'):
    data[key].update(share_data_config)
# Optimizer
optimizer = {'type': 'AdamW', 'lr': 1e-4, 'weight_decay': 1e-2}
optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}

# Step policy with a 200-iteration linear warmup (warmup_ratio 0.001).
# The single step value, 24, equals runner['max_epochs'] below.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 200,
    'warmup_ratio': 0.001,
    'step': [24],
}
runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
custom_hooks = [
    {'type': 'MEGVIIEMAHook', 'init_updates': 10560, 'priority': 'NORMAL'},
]

# Checkpoint used to initialise the weights.
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')

# Evaluation reuses the test-time pipeline.
evaluation = {'interval': 1, 'start': 20, 'pipeline': test_pipeline}
checkpoint_config = {'interval': 1, 'max_keep_ckpts': 5}
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.33
# ===> barrier - IoU = 41.02
# ===> bicycle - IoU = 22.16
# ===> bus - IoU = 39.75
# ===> car - IoU = 42.63
# ===> construction_vehicle - IoU = 20.53
# ===> motorcycle - IoU = 24.01
# ===> pedestrian - IoU = 23.71
# ===> traffic_cone - IoU = 24.65
# ===> trailer - IoU = 25.58
# ===> truck - IoU = 30.63
# ===> driveable_surface - IoU = 58.0
# ===> other_flat - IoU = 32.12
# ===> sidewalk - IoU = 33.78
# ===> terrain - IoU = 31.02
# ===> manmade - IoU = 17.67
# ===> vegetation - IoU = 17.74
# ===> mIoU of 6019 samples: 29.14
# {'mIoU': array([0.103, 0.41 , 0.222, 0.397, 0.426, 0.205, 0.24 , 0.237, 0.246,
# 0.256, 0.306, 0.58 , 0.321, 0.338, 0.31 , 0.177, 0.177, 0.832])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.095 | 0.107 | 0.110 |
# | barrier | 0.374 | 0.429 | 0.452 |
# | bicycle | 0.208 | 0.242 | 0.248 |
# | bus | 0.498 | 0.603 | 0.659 |
# | car | 0.489 | 0.568 | 0.598 |
# | construction_vehicle | 0.171 | 0.247 | 0.279 |
# | motorcycle | 0.190 | 0.277 | 0.298 |
# | pedestrian | 0.295 | 0.344 | 0.361 |
# | traffic_cone | 0.290 | 0.324 | 0.332 |
# | trailer | 0.207 | 0.292 | 0.368 |
# | truck | 0.411 | 0.507 | 0.551 |
# | driveable_surface | 0.531 | 0.614 | 0.704 |
# | other_flat | 0.286 | 0.325 | 0.357 |
# | sidewalk | 0.234 | 0.280 | 0.328 |
# | terrain | 0.220 | 0.290 | 0.356 |
# | manmade | 0.267 | 0.343 | 0.392 |
# | vegetation | 0.174 | 0.272 | 0.358 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.291 | 0.357 | 0.397 |
# +----------------------+----------+----------+----------+
# 6019it [09:34, 10.48it/s]
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.024 | 0.025 |
# | barrier | 0.107 | 0.169 | 0.204 |
# | bicycle | 0.069 | 0.086 | 0.088 |
# | bus | 0.244 | 0.350 | 0.408 |
# | car | 0.238 | 0.326 | 0.352 |
# | construction_vehicle | 0.018 | 0.081 | 0.105 |
# | motorcycle | 0.061 | 0.105 | 0.117 |
# | pedestrian | 0.016 | 0.022 | 0.023 |
# | traffic_cone | 0.030 | 0.049 | 0.052 |
# | trailer | 0.029 | 0.047 | 0.056 |
# | truck | 0.151 | 0.240 | 0.286 |
# | driveable_surface | 0.407 | 0.531 | 0.662 |
# | other_flat | 0.054 | 0.078 | 0.098 |
# | sidewalk | 0.009 | 0.030 | 0.061 |
# | terrain | 0.006 | 0.022 | 0.045 |
# | manmade | 0.044 | 0.091 | 0.128 |
# | vegetation | 0.001 | 0.021 | 0.091 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.088 | 0.134 | 0.165 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.34819957391233375, 'RayIoU@1': 0.29065973127346445, 'RayIoU@2': 0.3566749015912661, 'RayIoU@4': 0.39726408887227066,
# 'RayPQ': 0.12890890185841564, 'RayPQ@1': 0.08832135839934552, 'RayPQ@2': 0.1336058084882046, 'RayPQ@4': 0.1647995386876968}
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
0 → 100644
View file @
d2b71343
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera set, image sizes and image-level augmentation ranges.
data_config = dict(
    cams=[
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    Ncams=6,
    input_size=(256, 704),
    src_size=(900, 1600),
    # Augmentation
    resize=(-0.06, 0.11),
    rot=(-5.4, 5.4),
    flip=True,
    crop_h=(0.0, 0.0),
    resize_test=0.00,
)

grid_config = dict(
    x=[-40, 40, 0.4],
    y=[-40, 40, 0.4],
    z=[-1, 5.4, 6.4],
    depth=[1.0, 45.0, 1.0],
)

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 64
# Model definition. Values that reference grid_config, data_config and
# numC_Trans are computed from the constants defined earlier in this file.
model = dict(
    type='BEVDepthOCC',  # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0],
    ),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16,
    ),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8],
    ),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=128,
    ),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=128,
        out_dim=128,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0,
        ),
    ),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = {'backend': 'disk'}

# BEV-space augmentation: rotation and scale limits are degenerate ranges,
# both flip ratios are 0.5.
bda_aug_conf = {
    'rot_lim': (-0., 0.),
    'scale_lim': (1., 1.),
    'flip_dx_ratio': 0.5,
    'flip_dy_ratio': 0.5,
}
# Training pipeline: a list of transform configs. Each dict names a transform
# through `type` and carries that transform's keyword settings.
train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='PointToMultiViewDepth',
        downsample=1,
        grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    # Keys collected for the model during training.
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera'])
]
# Test/validation pipeline: a list of transform configs. Compared with
# `train_pipeline` it passes no `is_train=True`, loads no occupancy ground
# truth, and wraps formatting/collection inside 'MultiScaleFlipAug3D'.
test_pipeline = [
    dict(
        type='PrepareImageInputs',
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]
# Sensor-modality flags handed to the dataset as `modality`; only the camera
# flag is enabled.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Dataset settings common to every split; merged into each entry of `data`
# by the loop that follows the `data` definition.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)
# Dataset config used for both the `val` and `test` splits: the test pipeline
# plus the validation annotation file located under `data_root`.
test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
# Loader settings plus the per-split dataset configs.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

# Merge the shared dataset settings into every split. `val` and `test` refer
# to the same dict object, so it is updated twice with identical values.
# The loop variable keeps the name `key` because it is a module-level name
# that stays defined after the loop.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)
# Optimizer
# Optimizer config: AdamW with learning rate 1e-4 and weight decay 1e-2.
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
# Gradient clipping settings: max_norm 5 with norm_type 2.
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
# Learning-rate schedule: 'step' policy with a linear warmup over 200
# iterations starting from a ratio of 0.001.
# NOTE(review): the only step milestone (24) equals `max_epochs` in `runner`,
# so the step decay may never take effect within this run — confirm intent.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
# Runner config: epoch-based runner with max_epochs set to 24.
runner = dict(type='EpochBasedRunner', max_epochs=24)
# Extra hooks: a single 'MEGVIIEMAHook' entry.
# NOTE(review): `init_updates=10560` is presumably an iteration count tied to
# the dataset size / batch size — confirm if either changes.
custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]
# Checkpoint path the run is initialised from.
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# Evaluation settings: interval 1, start 20, using the test pipeline.
# NOTE(review): `start=20` presumably means evaluation begins at epoch 20 — confirm.
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
# Checkpoint settings: interval 1, keeping at most 5 checkpoints.
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.69
# ===> barrier - IoU = 39.67
# ===> bicycle - IoU = 22.01
# ===> bus - IoU = 39.99
# ===> car - IoU = 40.46
# ===> construction_vehicle - IoU = 20.44
# ===> motorcycle - IoU = 24.52
# ===> pedestrian - IoU = 22.5
# ===> traffic_cone - IoU = 23.72
# ===> trailer - IoU = 25.93
# ===> truck - IoU = 29.75
# ===> driveable_surface - IoU = 58.29
# ===> other_flat - IoU = 31.46
# ===> sidewalk - IoU = 33.92
# ===> terrain - IoU = 31.25
# ===> manmade - IoU = 17.46
# ===> vegetation - IoU = 17.97
# ===> mIoU of 6019 samples: 28.83
# {'mIoU': array([0.1068576 , 0.3967071 , 0.220114 , 0.3998965 , 0.40462457,
# 0.20442682, 0.24516316, 0.22497209, 0.23719173, 0.25925541,
# 0.29754347, 0.58293305, 0.31458314, 0.33921965, 0.31254221,
# 0.17456574, 0.17970859, 0.8315865 ])}
# Starting Evaluation...
# 6019it [10:23, 9.65it/s]
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.094 | 0.107 | 0.111 |
# | barrier | 0.367 | 0.421 | 0.443 |
# | bicycle | 0.209 | 0.251 | 0.261 |
# | bus | 0.498 | 0.601 | 0.665 |
# | car | 0.472 | 0.550 | 0.581 |
# | construction_vehicle | 0.175 | 0.251 | 0.287 |
# | motorcycle | 0.205 | 0.292 | 0.315 |
# | pedestrian | 0.289 | 0.339 | 0.354 |
# | traffic_cone | 0.276 | 0.302 | 0.314 |
# | trailer | 0.203 | 0.289 | 0.380 |
# | truck | 0.396 | 0.493 | 0.546 |
# | driveable_surface | 0.528 | 0.611 | 0.702 |
# | other_flat | 0.280 | 0.315 | 0.346 |
# | sidewalk | 0.233 | 0.279 | 0.328 |
# | terrain | 0.218 | 0.286 | 0.353 |
# | manmade | 0.268 | 0.347 | 0.398 |
# | vegetation | 0.174 | 0.272 | 0.358 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.287 | 0.353 | 0.397 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.34574739050176573, 'RayIoU@1': 0.2873820616941079, 'RayIoU@2': 0.3533573712072785,
# 'RayIoU@4': 0.39650273860391083}
docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
0 → 100644
View file @
d2b71343
_base_
=
[
'./flashoccv2-r50-depth.py'
,
]
# Model overrides applied on top of the base config: `wocc` on, `wdet3d` off.
# NOTE(review): presumably these toggle the occupancy and 3D-detection outputs
# for the TensorRT export variant — confirm against the model class.
model = dict(
    wocc=True,
    wdet3d=False,
)
Prev
1
2
3
4
5
6
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment