OpenDAS / maskrcnn

Commit 2ea43786, authored May 06, 2025 by zhanggzh
parent d06288b2

    add maskrcnn code

Pipeline #2686: canceled with stages

Showing 2 changed files, with 142 additions and 0 deletions:

NVIDIA/benchmarks/maskrcnn/implementations/pytorch/maskrcnn/maskrcnn_benchmark/csrc/cuda/syncfree/compat.h (+9, -0)
NVIDIA/benchmarks/maskrcnn/implementations/pytorch/maskrcnn/maskrcnn_benchmark/csrc/cuda/syncfree/multi_tensor_apply.cuh (+133, -0)
NVIDIA/benchmarks/maskrcnn/implementations/pytorch/maskrcnn/maskrcnn_benchmark/csrc/cuda/syncfree/compat.h (new file, mode 100644)
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
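
For orientation (not part of this commit): these compatibility macros let the same source build against both older and newer PyTorch/ATen releases. A minimal sketch of how a call site consumes them, assuming a contiguous float CPU tensor; the function name scale_inplace is illustrative only:

// Sketch only, not in this commit: DATA_PTR expands to Tensor::data_ptr on
// PyTorch >= 1.3 and to the older Tensor::data accessor otherwise, so the
// same call site compiles against either release.
#include <torch/extension.h>
#include "compat.h"

void scale_inplace(at::Tensor t, float alpha) {
  float* p = t.DATA_PTR<float>();               // version-agnostic raw pointer access
  for (int64_t i = 0; i < t.numel(); ++i)
    p[i] *= alpha;                               // CPU-side illustration only
}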
NVIDIA/benchmarks/maskrcnn/implementations/pytorch/maskrcnn/maskrcnn_benchmark/csrc/cuda/syncfree/multi_tensor_apply.cuh (new file, mode 100644)
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
constexpr int depth_to_max_tensors[6] = {110, 64, 48, 36, 30, 24};
constexpr int depth_to_max_blocks[6] = {320, 320, 320, 320, 320, 320};

template<int n> struct TensorListMetadata
{
  void* addresses[n][depth_to_max_tensors[n-1]];
  int sizes[depth_to_max_tensors[n-1]];
  unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
  int block_to_chunk[depth_to_max_blocks[n-1]];
  // I fear this needs to be a full int.
  int start_tensor_this_launch;
};

template<typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
  int chunk_size,
  volatile int* noop_flag,
  T tl,
  U callable,
  ArgTypes... args)
{
  // Hand the chunk information to the user-supplied functor to process however it likes.
  callable(chunk_size, noop_flag, tl, args...);
}

template<int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
  int block_size,
  int chunk_size,
  const at::Tensor& noop_flag,
  const std::vector<std::vector<at::Tensor>>& tensor_lists,
  T callable,
  ArgTypes... args)
{
  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
  int len0 = tensor_lists[0].size();
  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
  auto ref_device = tensor_lists[0][0].device();
  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
  for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
  {
    TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
    for (int t = 0; t < tensor_lists[l].size(); t++)
    {
      // TODO: Print which tensor fails.
      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
      contiguous_memory = (contiguous_memory ||
                           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) ||
                           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
                  "A tensor was not on the same device as the first tensor");
      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
    }
  }

  int ntensors = tensor_lists[0].size();

  TensorListMetadata<depth> tl;

  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
  auto stream = at::cuda::getCurrentCUDAStream();

  tl.start_tensor_this_launch = 0;
  int loc_block_info = 0;
  int loc_tensor_info = 0;
  for (int t = 0; t < ntensors; t++)
  {
    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
    for (int d = 0; d < depth; d++)
      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
    loc_tensor_info++;

    int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

    for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
    {
      // std::cout << chunks_this_tensor << std::endl;
      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
      tl.block_to_chunk[loc_block_info] = chunk;
      loc_block_info++;

      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                           chunk == chunks_this_tensor - 1);
      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
      if (tensors_full || blocks_full || last_chunk)
      {
        // using accscalar_t = acc_type<scalar_t, true>;
        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
          chunk_size,
          noop_flag.DATA_PTR<int>(),
          tl,
          callable,
          args...);

        AT_CUDA_CHECK(cudaGetLastError());

        // Reset. The control flow possibilities here make my brain hurt.
        loc_block_info = 0;
        if (chunk == chunks_this_tensor - 1)
        {
          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
          loc_tensor_info = 0;
          tl.start_tensor_this_launch = t + 1;
        }
        else
        {
          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
          for (int d = 0; d < depth; d++)
            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
          loc_tensor_info = 1;
          tl.start_tensor_this_launch = t;
        }
      }
    }
  }
}
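
For orientation (not part of this commit): a minimal sketch of how a caller typically drives multi_tensor_apply, modeled on NVIDIA Apex's multi_tensor_scale. The names ScaleFunctor and multi_tensor_scale_cuda are illustrative assumptions, not code from this repository; depth 2 means two parallel tensor lists (inputs and outputs).

// Sketch only: a functor in the shape multi_tensor_apply_kernel expects,
// i.e. callable(chunk_size, noop_flag, tl, extra args...).
#include <ATen/ATen.h>
#include "multi_tensor_apply.cuh"

template<typename T>
struct ScaleFunctor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_flag,
    TensorListMetadata<2>& tl,
    float scale)
  {
    // Each block handles one chunk of one tensor, as recorded in the metadata.
    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    const T* in = static_cast<const T*>(tl.addresses[0][tensor_loc]) + chunk_idx * chunk_size;
    T* out = static_cast<T*>(tl.addresses[1][tensor_loc]) + chunk_idx * chunk_size;
    n -= chunk_idx * chunk_size;
    if (n > chunk_size) n = chunk_size;

    if (*noop_flag) return;  // skip work if an earlier kernel raised the flag
    for (int i = threadIdx.x; i < n; i += blockDim.x)
      out[i] = static_cast<T>(static_cast<float>(in[i]) * scale);
  }
};

// Host-side entry point: tensor_lists = { inputs, outputs }.
void multi_tensor_scale_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  float scale)
{
  multi_tensor_apply<2>(512, chunk_size, noop_flag, tensor_lists, ScaleFunctor<float>(), scale);
}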