OpenDAS / dgl · Commits

Commit 096bbf96 (Unverified)
Authored Nov 28, 2023 by Muhammed Fatih BALIN; committed Nov 29, 2023 by GitHub.
[Graphbolt][CUDA] Optimize UVA index select (#6632)

Parent: a28f1f9a

Changes: 2 files, 66 additions and 25 deletions (+66 −25)
  graphbolt/src/cuda/index_select_impl.cu (+48 −22)
  tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py (+18 −3)

graphbolt/src/cuda/index_select_impl.cu (view file @ 096bbf96)
@@ -3,6 +3,7 @@
  * @file cuda/index_select_impl.cu
  * @brief Index select operator implementation on CUDA.
  */
+#include <c10/core/ScalarType.h>
 #include <c10/cuda/CUDAStream.h>
 #include <torch/script.h>
@@ -101,14 +102,16 @@ template <typename DType, typename IdType>
 torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
   const int64_t input_len = input.size(0);
   const int64_t return_len = index.size(0);
-  const int64_t feature_size = std::accumulate(
-      input.sizes().begin() + 1, input.sizes().end(), 1, std::multiplies<>());
+  const int64_t original_feature_size = std::accumulate(
+      input.sizes().begin() + 1, input.sizes().end(), 1ll, std::multiplies<>());
+  const auto aligned_feature_size =
+      input.element_size() * original_feature_size / sizeof(DType);
   torch::Tensor ret = torch::empty(
-      {return_len, feature_size},
+      {return_len, original_feature_size},
       torch::TensorOptions()
           .dtype(input.dtype())
           .device(c10::DeviceType::CUDA));
-  DType* input_ptr = input.data_ptr<DType>();
-  DType* ret_ptr = ret.data_ptr<DType>();
+  DType* input_ptr = reinterpret_cast<DType*>(input.data_ptr());
+  DType* ret_ptr = reinterpret_cast<DType*>(ret.data_ptr());
   // Sort the index to improve the memory access pattern.
   torch::Tensor sorted_index, permutation;
@@ -118,7 +121,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
   cudaStream_t stream = torch::cuda::getDefaultCUDAStream();
-  if (feature_size == 1) {
+  if (aligned_feature_size == 1) {
     // Use a single thread to process each output row to avoid wasting threads.
     const int num_threads = cuda::FindNumThreads(return_len);
     const int num_blocks = (return_len + num_threads - 1) / num_threads;
@@ -127,23 +130,24 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
         input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
   } else {
     dim3 block(512, 1);
-    while (static_cast<int64_t>(block.x) >= 2 * feature_size) {
+    while (static_cast<int64_t>(block.x) >= 2 * aligned_feature_size) {
       block.x >>= 1;
       block.y <<= 1;
     }
     const dim3 grid((return_len + block.y - 1) / block.y);
-    if (feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
+    if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
       // When feature size is smaller than GPU cache line size, use unaligned
       // version for less SM usage, which is more resource efficient.
       CUDA_KERNEL_CALL(
           IndexSelectMultiKernel, grid, block, 0, stream, input_ptr, input_len,
-          feature_size, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
+          aligned_feature_size, index_sorted_ptr, return_len, ret_ptr,
+          permutation_ptr);
     } else {
       // Use aligned version to improve the memory access pattern.
       CUDA_KERNEL_CALL(
           IndexSelectMultiKernelAligned, grid, block, 0, stream, input_ptr,
-          input_len, feature_size, index_sorted_ptr, return_len, ret_ptr,
-          permutation_ptr);
+          input_len, aligned_feature_size, index_sorted_ptr, return_len,
+          ret_ptr, permutation_ptr);
     }
   }
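
For context on the launch configuration in the hunk above, here is a minimal standalone sketch of the block-shaping loop, assuming a hypothetical per-row width of 4 elements (an illustration, not code from the commit). Starting from a 512x1 block, threads are shifted from the feature axis (x) to the row axis (y) until block.x drops below twice the per-row width, so narrow feature rows do not leave most of a block idle; the cache-line check that follows then picks between the unaligned and aligned kernel variants.

    // Standalone illustration (not from the commit): mirrors the while-loop above
    // using plain integers in place of CUDA's dim3.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t aligned_feature_size = 4;  // hypothetical per-row width in DType units
      unsigned block_x = 512, block_y = 1;
      while (static_cast<int64_t>(block_x) >= 2 * aligned_feature_size) {
        block_x >>= 1;  // fewer threads along the feature dimension
        block_y <<= 1;  // more rows handled per block
      }
      // Prints "block = (4, 128)": 4 threads copy one row, 128 rows per block.
      std::printf("block = (%u, %u)\n", block_x, block_y);
      return 0;
    }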
@@ -157,18 +161,40 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
 /**
  * @brief UVA index select operator implementation on CUDA.
  *
- * The supporting input types are: float, double, int, int64_t.
+ * All basic torch types are supported for input.
  * The supporting index types are: int, int64_t.
  */
 torch::Tensor UVAIndexSelectImpl(torch::Tensor input, torch::Tensor index) {
-  return AT_DISPATCH_FLOATING_TYPES_AND2(
-      at::ScalarType::Int, at::ScalarType::Long, input.scalar_type(),
-      "UVAIndexSelectImpl", [&] {
-        return AT_DISPATCH_INDEX_TYPES(
-            index.scalar_type(), "UVAIndexSelectImpl", [&] {
-              return UVAIndexSelectImpl_<scalar_t, index_t>(input, index);
-            });
-      });
+  return AT_DISPATCH_INDEX_TYPES(
+      index.scalar_type(), "UVAIndexSelectImpl", ([&] {
+        const auto ptr = (size_t)input.data_ptr();
+        const int64_t feature_size = std::accumulate(
+            input.sizes().begin() + 1, input.sizes().end(), 1ll,
+            std::multiplies<>());
+        // We perform the copy with datatype of size powers of 2, and the
+        // maximum data type we use has 16 bytes. We check the alignment of the
+        // pointer and the feature dimensionality to determine the largest
+        // type to use for the copy to minimize the number of CUDA threads used.
+        // Alignment denotes the maximum suitable alignment and datatype size
+        // for the copies.
+        const int aligned_access_size =
+            std::gcd(16, std::gcd(ptr, input.element_size() * feature_size));
+        switch (aligned_access_size) {
+          case 1:
+            return UVAIndexSelectImpl_<uint8_t, index_t>(input, index);
+          case 2:
+            return UVAIndexSelectImpl_<uint16_t, index_t>(input, index);
+          case 4:
+            return UVAIndexSelectImpl_<uint32_t, index_t>(input, index);
+          case 8:
+            return UVAIndexSelectImpl_<uint64_t, index_t>(input, index);
+          case 16:
+            return UVAIndexSelectImpl_<float4, index_t>(input, index);
+          default:
+            TORCH_CHECK(false, "UVAIndexSelectImpl: Unreachable code path!");
+            return torch::Tensor{};
+        }
+      }));
 }
 }  // namespace ops
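
The rewritten UVAIndexSelectImpl above dispatches on the byte width of an aligned copy rather than on the input's scalar type, which is why the doc comment now states that all basic torch types are supported. A minimal standalone sketch of the aligned_access_size computation follows (an illustration under hypothetical values, not code from the commit): the widest power-of-two type, up to 16 bytes, that divides both the base pointer address and the per-row byte count is chosen for the copy.

    // Standalone illustration (not from the commit): mirrors the std::gcd
    // expression used to pick the copy type in UVAIndexSelectImpl.
    #include <cstdint>
    #include <cstdio>
    #include <numeric>  // std::gcd

    int main() {
      const int64_t ptr = 0x7f0000000000;   // hypothetical pinned-buffer address
      const int64_t element_size = 1;       // e.g. torch.int8
      const int64_t feature_size = 13 * 3;  // elements per row, e.g. shape (137, 13, 3)
      const long long aligned_access_size =
          std::gcd(16LL, std::gcd(ptr, element_size * feature_size));
      // 39-byte rows give gcd(16, gcd(ptr, 39)) == 1, so the uint8_t kernel is
      // used; float32 rows (156 bytes) would give 4 and use uint32_t copies.
      std::printf("aligned_access_size = %lld\n", aligned_access_size);
      return 0;
    }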

tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py (view file @ 096bbf96)
@@ -141,12 +141,27 @@ def test_torch_based_feature(in_memory):
     reason="Tests for pinned memory are only meaningful on GPU.",
 )
 @pytest.mark.parametrize(
-    "dtype", [torch.float32, torch.float64, torch.int32, torch.int64]
+    "dtype",
+    [
+        torch.float32,
+        torch.float64,
+        torch.int32,
+        torch.int64,
+        torch.int8,
+        torch.float16,
+        torch.complex128,
+    ],
 )
 @pytest.mark.parametrize("idtype", [torch.int32, torch.int64])
-@pytest.mark.parametrize("shape", [(2, 1), (2, 3), (2, 2, 2)])
+@pytest.mark.parametrize("shape", [(2, 1), (2, 3), (2, 2, 2), (137, 13, 3)])
 def test_torch_based_pinned_feature(dtype, idtype, shape):
-    tensor = torch.arange(0, reduce(mul, shape), dtype=dtype).reshape(shape)
+    if dtype == torch.complex128:
+        tensor = torch.complex(
+            torch.randint(0, 13, shape, dtype=torch.float64),
+            torch.randint(0, 13, shape, dtype=torch.float64),
+        )
+    else:
+        tensor = torch.randint(0, 13, shape, dtype=dtype)
     test_tensor = tensor.clone().detach()
     test_tensor_cuda = test_tensor.cuda()
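
(The added dtypes and the (137, 13, 3) shape exercise the byte-width dispatch added in index_select_impl.cu above: with torch.int8 a feature row holds 13 * 3 = 39 bytes, so gcd(16, 39) = 1 and the byte-wise kernel path is taken, whereas torch.float32 rows of 156 bytes permit 4-byte copies, assuming a sufficiently aligned base pointer. The switch from torch.arange to torch.randint appears motivated by the larger shape, since 137 * 13 * 3 = 5343 sequential values would overflow narrow dtypes such as torch.int8, and torch.complex128 is constructed with torch.complex since torch.randint does not produce complex tensors directly.)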