Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
6ac701f8
Commit
6ac701f8
authored
Sep 13, 2024
by
sangwzh
Browse files
update src and graphbolt code
parent
1547bd93
Changes
116
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
105 additions
and
81 deletions
+105
-81
graphbolt/src/cuda/common.h
graphbolt/src/cuda/common.h
+20
-19
graphbolt/src/cuda/cumsum.hip
graphbolt/src/cuda/cumsum.hip
+3
-2
graphbolt/src/cuda/expand_indptr.hip
graphbolt/src/cuda/expand_indptr.hip
+4
-3
graphbolt/src/cuda/gpu_cache.hip
graphbolt/src/cuda/gpu_cache.hip
+3
-2
graphbolt/src/cuda/index_select_csc_impl.hip
graphbolt/src/cuda/index_select_csc_impl.hip
+8
-6
graphbolt/src/cuda/index_select_impl.hip
graphbolt/src/cuda/index_select_impl.hip
+7
-5
graphbolt/src/cuda/insubgraph.hip
graphbolt/src/cuda/insubgraph.hip
+2
-1
graphbolt/src/cuda/isin.hip
graphbolt/src/cuda/isin.hip
+2
-1
graphbolt/src/cuda/max_uva_threads.cc
graphbolt/src/cuda/max_uva_threads.cc
+2
-1
graphbolt/src/cuda/neighbor_sampler.hip
graphbolt/src/cuda/neighbor_sampler.hip
+15
-12
graphbolt/src/cuda/sampling_utils.hip
graphbolt/src/cuda/sampling_utils.hip
+6
-5
graphbolt/src/cuda/sort_impl.hip
graphbolt/src/cuda/sort_impl.hip
+4
-3
graphbolt/src/cuda/unique_and_compact_impl.hip
graphbolt/src/cuda/unique_and_compact_impl.hip
+5
-4
graphbolt/src/expand_indptr.cc
graphbolt/src/expand_indptr.cc
+3
-2
graphbolt/src/fused_csc_sampling_graph.cc
graphbolt/src/fused_csc_sampling_graph.cc
+5
-4
graphbolt/src/index_select.cc
graphbolt/src/index_select.cc
+3
-2
graphbolt/src/isin.cc
graphbolt/src/isin.cc
+3
-2
graphbolt/src/python_binding.cc
graphbolt/src/python_binding.cc
+6
-5
graphbolt/src/random.cc
graphbolt/src/random.cc
+2
-1
graphbolt/src/shared_memory_helper.cc
graphbolt/src/shared_memory_helper.cc
+2
-1
No files found.
graphbolt/src/cuda/common.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2017-2023 by Contributors
* Copyright (c) 2017-2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -7,11 +8,11 @@
...
@@ -7,11 +8,11 @@
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_
#include <ATen/
cuda/CUDA
Event.h>
#include <ATen/
hip/HIP
Event.h>
#include <
c10/cuda/CUDACachingAllocator
.h>
#include <
ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA
.h>
#include <c10/
cuda/CUDA
Exception.h>
#include <c10/
hip/HIP
Exception.h>
#include <
c10/cuda/CUDAStream
.h>
#include <
ATen/hip/impl/HIPStreamMasqueradingAsCUDA
.h>
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <torch/script.h>
#include <torch/script.h>
#include <memory>
#include <memory>
...
@@ -26,8 +27,8 @@ namespace cuda {
...
@@ -26,8 +27,8 @@ namespace cuda {
* that uses torch's CUDA memory pool and the current cuda stream:
* that uses torch's CUDA memory pool and the current cuda stream:
*
*
* cuda::CUDAWorkspaceAllocator allocator;
* cuda::CUDAWorkspaceAllocator allocator;
* const auto stream = torch::
cuda
::getDefault
CUDA
Stream();
* const auto stream = torch::
hip
::getDefault
HIP
Stream
MasqueradingAsCUDA
();
* const auto exec_policy = thrust::
cuda
::par_nosync(allocator).on(stream);
* const auto exec_policy = thrust::
hip
::par_nosync(allocator).on(stream);
*
*
* Now, one can pass exec_policy to thrust functions
* Now, one can pass exec_policy to thrust functions
*
*
...
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
...
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
CUDAWorkspaceAllocator
&
operator
=
(
const
CUDAWorkspaceAllocator
&
)
=
default
;
CUDAWorkspaceAllocator
&
operator
=
(
const
CUDAWorkspaceAllocator
&
)
=
default
;
void
operator
()(
void
*
ptr
)
const
{
void
operator
()(
void
*
ptr
)
const
{
c10
::
cuda
::
CUDA
CachingAllocator
::
raw_delete
(
ptr
);
c10
::
hip
::
HIP
CachingAllocator
::
raw_delete
(
ptr
);
}
}
// Required by thrust to satisfy allocator requirements.
// Required by thrust to satisfy allocator requirements.
value_type
*
allocate
(
std
::
ptrdiff_t
size
)
const
{
value_type
*
allocate
(
std
::
ptrdiff_t
size
)
const
{
return
reinterpret_cast
<
value_type
*>
(
return
reinterpret_cast
<
value_type
*>
(
c10
::
cuda
::
CUDA
CachingAllocator
::
raw_alloc
(
size
));
c10
::
hip
::
HIP
CachingAllocator
::
raw_alloc
(
size
));
}
}
// Required by thrust to satisfy allocator requirements.
// Required by thrust to satisfy allocator requirements.
...
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
...
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
inline
auto
GetAllocator
()
{
return
CUDAWorkspaceAllocator
{};
}
inline
auto
GetAllocator
()
{
return
CUDAWorkspaceAllocator
{};
}
inline
auto
GetCurrentStream
()
{
return
c10
::
cuda
::
getCurrent
CUDA
Stream
();
}
inline
auto
GetCurrentStream
()
{
return
c10
::
hip
::
getCurrent
HIP
Stream
MasqueradingAsCUDA
();
}
template
<
typename
T
>
template
<
typename
T
>
inline
bool
is_zero
(
T
size
)
{
inline
bool
is_zero
(
T
size
)
{
...
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
...
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
return
size
.
x
==
0
||
size
.
y
==
0
||
size
.
z
==
0
;
return
size
.
x
==
0
||
size
.
y
==
0
||
size
.
z
==
0
;
}
}
#define CUDA_CALL(func) C10_
CUDA
_CHECK((func))
#define CUDA_CALL(func) C10_
HIP
_CHECK((func))
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
{ \
{ \
if (!graphbolt::cuda::is_zero((nblks)) && \
if (!graphbolt::cuda::is_zero((nblks)) && \
!graphbolt::cuda::is_zero((nthrs))) { \
!graphbolt::cuda::is_zero((nthrs))) { \
auto stream = graphbolt::cuda::GetCurrentStream(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
(kernel)
<<<
(nblks), (nthrs), (shmem), stream
>>>(
__VA_ARGS__); \
hipLaunchKernelGGL((
(kernel)
), dim3(
(nblks)
)
,
dim3(
(nthrs)
)
, (shmem), stream
,
__VA_ARGS__); \
C10_
CUDA
_KERNEL_LAUNCH_CHECK(); \
C10_
HIP
_KERNEL_LAUNCH_CHECK(); \
} \
} \
}
}
...
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
...
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
auto allocator = graphbolt::cuda::GetAllocator(); \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
size_t workspace_size = 0; \
size_t workspace_size = 0; \
CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
CUDA_CALL(
hip
cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
auto workspace = allocator.AllocateStorage<char>(workspace_size); \
auto workspace = allocator.AllocateStorage<char>(workspace_size); \
CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
CUDA_CALL(
hip
cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
}
}
#define THRUST_CALL(fn, ...) \
#define THRUST_CALL(fn, ...) \
[&] { \
[&] { \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
const auto exec_policy = thrust::
cuda
::par_nosync(allocator).on(stream); \
const auto exec_policy = thrust::
hip
::par_nosync(allocator).on(stream); \
return thrust::fn(exec_policy, __VA_ARGS__); \
return thrust::fn(exec_policy, __VA_ARGS__); \
}()
}()
...
@@ -126,7 +127,7 @@ template <typename scalar_t>
...
@@ -126,7 +127,7 @@ template <typename scalar_t>
struct
CopyScalar
{
struct
CopyScalar
{
CopyScalar
()
:
is_ready_
(
true
)
{
init_pinned_storage
();
}
CopyScalar
()
:
is_ready_
(
true
)
{
init_pinned_storage
();
}
void
record
(
at
::
cuda
::
CUDAStream
stream
=
GetCurrentStream
())
{
void
record
(
at
::
hip
::
HIPStreamMasqueradingAsCUDA
stream
=
GetCurrentStream
())
{
copy_event_
.
record
(
stream
);
copy_event_
.
record
(
stream
);
is_ready_
=
false
;
is_ready_
=
false
;
}
}
...
@@ -138,9 +139,9 @@ struct CopyScalar {
...
@@ -138,9 +139,9 @@ struct CopyScalar {
CopyScalar
(
const
scalar_t
*
device_ptr
)
{
CopyScalar
(
const
scalar_t
*
device_ptr
)
{
init_pinned_storage
();
init_pinned_storage
();
auto
stream
=
GetCurrentStream
();
auto
stream
=
GetCurrentStream
();
CUDA_CALL
(
cuda
MemcpyAsync
(
CUDA_CALL
(
hip
MemcpyAsync
(
reinterpret_cast
<
scalar_t
*>
(
pinned_scalar_
.
data_ptr
()),
device_ptr
,
reinterpret_cast
<
scalar_t
*>
(
pinned_scalar_
.
data_ptr
()),
device_ptr
,
sizeof
(
scalar_t
),
cuda
MemcpyDeviceToHost
,
stream
));
sizeof
(
scalar_t
),
hip
MemcpyDeviceToHost
,
stream
));
record
(
stream
);
record
(
stream
);
}
}
...
...
graphbolt/src/cuda/cumsum.
cu
→
graphbolt/src/cuda/cumsum.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/cumsum.cu
* @file cuda/cumsum.cu
* @brief Cumsum operators implementation on CUDA.
* @brief Cumsum operators implementation on CUDA.
*/
*/
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/expand_indptr.
cu
→
graphbolt/src/cuda/expand_indptr.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -8,10 +9,10 @@
...
@@ -8,10 +9,10 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <limits>
#include <limits>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
...
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
CUB_CALL(
CUB_CALL(
DeviceCopy::Batched, input_buffer + i,
DeviceCopy::Batched, input_buffer + i,
output_buffer + i, buffer_sizes + i,
output_buffer + i, buffer_sizes + i,
std
::
min
(
num_rows
-
i
,
max_copy_at_once
));
::min(num_rows - i, max_copy_at_once));
}
}
}));
}));
}));
}));
...
...
graphbolt/src/cuda/gpu_cache.
cu
→
graphbolt/src/cuda/gpu_cache.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -6,8 +7,8 @@
...
@@ -6,8 +7,8 @@
*/
*/
#include <numeric>
#include <numeric>
#include "
./
common.h"
#include "common.h"
#include "
./
gpu_cache.h"
#include "gpu_cache.h"
namespace graphbolt {
namespace graphbolt {
namespace cuda {
namespace cuda {
...
...
graphbolt/src/cuda/index_select_csc_impl.
cu
→
graphbolt/src/cuda/index_select_csc_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -10,12 +12,12 @@
...
@@ -10,12 +12,12 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <numeric>
#include <numeric>
#include "
./
common.h"
#include "common.h"
#include "
./
max_uva_threads.h"
#include "max_uva_threads.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
...
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 block(BLOCK_SIZE);
const dim3 grid(
const dim3 grid(
(
std
::
min
(
edge_count_aligned
,
cuda
::
max_uva_threads
.
value_or
(
1
<<
20
))
+
(::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE - 1) /
BLOCK_SIZE);
BLOCK_SIZE);
...
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
...
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
CUB_CALL(
CUB_CALL(
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
buffer_sizes
+
i
,
std
::
min
(
num_nodes
-
i
,
max_copy_at_once
));
buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
}
}
}
}
...
...
graphbolt/src/cuda/index_select_impl.
cu
→
graphbolt/src/cuda/index_select_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -9,9 +11,9 @@
...
@@ -9,9 +11,9 @@
#include <numeric>
#include <numeric>
#include "
./
common.h"
#include "common.h"
#include "
./
max_uva_threads.h"
#include "max_uva_threads.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
...
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
// Use a single thread to process each output row to avoid wasting threads.
// Use a single thread to process each output row to avoid wasting threads.
const int num_threads = cuda::FindNumThreads(return_len);
const int num_threads = cuda::FindNumThreads(return_len);
const int num_blocks =
const int num_blocks =
(
std
::
min
(
return_len
,
cuda
::
max_uva_threads
.
value_or
(
1
<<
20
))
+
(::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
num_threads - 1) /
num_threads - 1) /
num_threads;
num_threads;
CUDA_KERNEL_CALL(
CUDA_KERNEL_CALL(
...
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
...
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
block.x >>= 1;
block.x >>= 1;
block.y <<= 1;
block.y <<= 1;
}
}
const
dim3
grid
(
std
::
min
(
const dim3 grid(::min(
(return_len + block.y - 1) / block.y,
(return_len + block.y - 1) / block.y,
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
...
...
graphbolt/src/cuda/insubgraph.
cu
→
graphbolt/src/cuda/insubgraph.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -8,7 +9,7 @@
...
@@ -8,7 +9,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/isin.
cu
→
graphbolt/src/cuda/isin.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -7,7 +8,7 @@
...
@@ -7,7 +8,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include <thrust/binary_search.h>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/max_uva_threads.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/max_uva_threads.cc
* @file cuda/max_uva_threads.cc
* @brief Max uva threads variable setter function.
* @brief Max uva threads variable setter function.
*/
*/
#include "
./
max_uva_threads.h"
#include "max_uva_threads.h"
namespace
graphbolt
{
namespace
graphbolt
{
namespace
cuda
{
namespace
cuda
{
...
...
graphbolt/src/cuda/neighbor_sampler.
cu
→
graphbolt/src/cuda/neighbor_sampler.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "hip/hip_bf16.h"
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -5,7 +8,7 @@
...
@@ -5,7 +8,7 @@
* @brief Neighbor sampling operator implementation on CUDA.
* @brief Neighbor sampling operator implementation on CUDA.
*/
*/
#include <c10/core/ScalarType.h>
#include <c10/core/ScalarType.h>
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <thrust/gather.h>
#include <thrust/gather.h>
...
@@ -15,14 +18,14 @@
...
@@ -15,14 +18,14 @@
#include <algorithm>
#include <algorithm>
#include <array>
#include <array>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <limits>
#include <limits>
#include <numeric>
#include <numeric>
#include <type_traits>
#include <type_traits>
#include "../random.h"
#include "../random.h"
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
...
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;
const int stride = gridDim.x * blockDim.x;
cu
randStatePhilox4_32_10_t
rng
;
hip
randStatePhilox4_32_10_t rng;
const auto labor = indices != nullptr;
const auto labor = indices != nullptr;
if (!labor) {
if (!labor) {
cu
rand_init
(
random_seed
,
i
,
0
,
&
rng
);
hip
rand_init(random_seed, i, 0, &rng);
}
}
while (i < num_edges) {
while (i < num_edges) {
...
@@ -58,10 +61,10 @@ __global__ void _ComputeRandoms(
...
@@ -58,10 +61,10 @@ __global__ void _ComputeRandoms(
if (labor) {
if (labor) {
constexpr uint64_t kCurandSeed = 999961;
constexpr uint64_t kCurandSeed = 999961;
cu
rand_init
(
kCurandSeed
,
random_seed
,
indices
[
in_idx
],
&
rng
);
hip
rand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
}
}
const
auto
rnd
=
cu
rand_uniform
(
&
rng
);
const auto rnd =
hip
rand_uniform(&rng);
const auto prob =
const auto prob =
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
const auto exp_rnd = -__logf(rnd);
const auto exp_rnd = -__logf(rnd);
...
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
...
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
}
}
// Finally, copy the adjusted fanout values to the device memory.
// Finally, copy the adjusted fanout values to the device memory.
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
CUDA_CALL
(
cuda
MemcpyAsync
(
CUDA_CALL(
hip
MemcpyAsync(
fanouts_device.get(), fanouts_pinned_ptr,
fanouts_device.get(), fanouts_pinned_ptr,
sizeof
(
int64_t
)
*
fanouts
.
size
(),
cuda
MemcpyHostToDevice
,
sizeof(int64_t) * fanouts.size(),
hip
MemcpyHostToDevice,
cuda::GetCurrentStream()));
cuda::GetCurrentStream()));
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
...
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
...
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
"Selected edge_id_t must be capable of storing edge_ids.");
"Selected edge_id_t must be capable of storing edge_ids.");
// Using bfloat16 for random numbers works just as reliably as
// Using bfloat16 for random numbers works just as reliably as
// float32 and provides a speedup of around 30 percent.
// float32 and provides a speedup of around 30 percent.
using
rnd_t
=
nv
_bfloat16
;
using rnd_t =
__hip
_bfloat16;
auto randoms =
auto randoms =
allocator.AllocateStorage<rnd_t>(num_edges.value());
allocator.AllocateStorage<rnd_t>(num_edges.value());
auto randoms_sorted =
auto randoms_sorted =
...
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
...
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
CUB_CALL(
CUB_CALL(
DeviceCopy::Batched, input_buffer_it + i,
DeviceCopy::Batched, input_buffer_it + i,
output_buffer_it + i, sampled_degree + i,
output_buffer_it + i, sampled_degree + i,
std
::
min
(
num_rows
-
i
,
max_copy_at_once
));
::min(num_rows - i, max_copy_at_once));
}
}
}));
}));
...
...
graphbolt/src/cuda/sampling_utils.
cu
→
graphbolt/src/cuda/sampling_utils.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -7,10 +8,10 @@
...
@@ -7,10 +8,10 @@
#include <thrust/for_each.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
...
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
CUB_CALL(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
DeviceAdjacentDifference::SubtractLeftCopy,
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
num_nodes
+
1
,
cub
::
Difference
{});
num_nodes + 1,
hip
cub::Difference{});
}));
}));
in_degree = in_degree.slice(0, 1);
in_degree = in_degree.slice(0, 1);
return {in_degree, sliced_indptr};
return {in_degree, sliced_indptr};
...
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
...
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
CUB_CALL(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
DeviceAdjacentDifference::SubtractLeftCopy,
new_sub_indptr.data_ptr<indptr_t>(),
new_sub_indptr.data_ptr<indptr_t>(),
new_indegree
.
data_ptr
<
indptr_t
>
(),
num_rows
+
1
,
cub
::
Difference
{});
new_indegree.data_ptr<indptr_t>(), num_rows + 1,
hip
cub::Difference{});
}));
}));
// Discard the first element of the SubtractLeftCopy result and ensure that
// Discard the first element of the SubtractLeftCopy result and ensure that
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
...
...
graphbolt/src/cuda/sort_impl.
cu
→
graphbolt/src/cuda/sort_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -6,10 +7,10 @@
...
@@ -6,10 +7,10 @@
*/
*/
#include <c10/core/ScalarType.h>
#include <c10/core/ScalarType.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/unique_and_compact_impl.
cu
→
graphbolt/src/cuda/unique_and_compact_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -10,11 +11,11 @@
...
@@ -10,11 +11,11 @@
#include <thrust/gather.h>
#include <thrust/gather.h>
#include <thrust/logical.h>
#include <thrust/logical.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <type_traits>
#include <type_traits>
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
...
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
// and max_id_dst.
// and max_id_dst.
if (num_bits == 0) {
if (num_bits == 0) {
num_bits = cuda::NumberOfBits(
num_bits = cuda::NumberOfBits(
1
+
std
::
max
(
1 + ::max(
static_cast<scalar_t>(max_id_src),
static_cast<scalar_t>(max_id_src),
static_cast<scalar_t>(max_id_dst)));
static_cast<scalar_t>(max_id_dst)));
}
}
...
...
graphbolt/src/expand_indptr.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -6,8 +7,8 @@
...
@@ -6,8 +7,8 @@
*/
*/
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include "
./
macro.h"
#include "macro.h"
#include "
./
utils.h"
#include "utils.h"
namespace
graphbolt
{
namespace
graphbolt
{
namespace
ops
{
namespace
ops
{
...
...
graphbolt/src/fused_csc_sampling_graph.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file fused_csc_sampling_graph.cc
* @file fused_csc_sampling_graph.cc
...
@@ -17,10 +18,10 @@
...
@@ -17,10 +18,10 @@
#include <tuple>
#include <tuple>
#include <vector>
#include <vector>
#include "
./
macro.h"
#include "macro.h"
#include "
./
random.h"
#include "random.h"
#include "
./
shared_memory_helper.h"
#include "shared_memory_helper.h"
#include "
./
utils.h"
#include "utils.h"
namespace
{
namespace
{
torch
::
optional
<
torch
::
Dict
<
std
::
string
,
torch
::
Tensor
>>
TensorizeDict
(
torch
::
optional
<
torch
::
Dict
<
std
::
string
,
torch
::
Tensor
>>
TensorizeDict
(
...
...
graphbolt/src/index_select.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file index_select.cc
* @file index_select.cc
...
@@ -6,8 +7,8 @@
...
@@ -6,8 +7,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include "
./
macro.h"
#include "macro.h"
#include "
./
utils.h"
#include "utils.h"
namespace
graphbolt
{
namespace
graphbolt
{
namespace
ops
{
namespace
ops
{
...
...
graphbolt/src/isin.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
*
*
...
@@ -8,8 +9,8 @@
...
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/isin.h>
#include <graphbolt/isin.h>
#include "
./
macro.h"
#include "macro.h"
#include "
./
utils.h"
#include "utils.h"
namespace
{
namespace
{
static
constexpr
int
kSearchGrainSize
=
4096
;
static
constexpr
int
kSearchGrainSize
=
4096
;
...
...
graphbolt/src/python_binding.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file python_binding.cc
* @file python_binding.cc
...
@@ -10,14 +11,14 @@
...
@@ -10,14 +11,14 @@
#include <graphbolt/unique_and_compact.h>
#include <graphbolt/unique_and_compact.h>
#ifdef GRAPHBOLT_USE_CUDA
#ifdef GRAPHBOLT_USE_CUDA
#include "
./
cuda/max_uva_threads.h"
#include "cuda/max_uva_threads.h"
#endif
#endif
#include "
./
expand_indptr.h"
#include "expand_indptr.h"
#include "
./
index_select.h"
#include "index_select.h"
#include "
./
random.h"
#include "random.h"
#ifdef GRAPHBOLT_USE_CUDA
#ifdef GRAPHBOLT_USE_CUDA
#include "
./
cuda/gpu_cache.h"
#include "cuda/gpu_cache.h"
#endif
#endif
namespace
graphbolt
{
namespace
graphbolt
{
...
...
graphbolt/src/random.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file random.cc
* @file random.cc
* @brief Random Engine.
* @brief Random Engine.
*/
*/
#include "
./
random.h"
#include "random.h"
#include <torch/torch.h>
#include <torch/torch.h>
...
...
graphbolt/src/shared_memory_helper.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
*
*
* @file shared_memory_helper.cc
* @file shared_memory_helper.cc
* @brief Shared memory helper implementation.
* @brief Shared memory helper implementation.
*/
*/
#include "
./
shared_memory_helper.h"
#include "shared_memory_helper.h"
#include <graphbolt/serialize.h>
#include <graphbolt/serialize.h>
#include <graphbolt/shared_memory.h>
#include <graphbolt/shared_memory.h>
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment