Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
6ac701f8
"vscode:/vscode.git/clone" did not exist on "68bd6934b1e683b6dcf2c9257db05ea5af69f1c5"
Commit
6ac701f8
authored
Sep 13, 2024
by
sangwzh
Browse files
update src and graphbolt code
parent
1547bd93
Changes
116
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
105 additions
and
81 deletions
+105
-81
graphbolt/src/cuda/common.h
graphbolt/src/cuda/common.h
+20
-19
graphbolt/src/cuda/cumsum.hip
graphbolt/src/cuda/cumsum.hip
+3
-2
graphbolt/src/cuda/expand_indptr.hip
graphbolt/src/cuda/expand_indptr.hip
+4
-3
graphbolt/src/cuda/gpu_cache.hip
graphbolt/src/cuda/gpu_cache.hip
+3
-2
graphbolt/src/cuda/index_select_csc_impl.hip
graphbolt/src/cuda/index_select_csc_impl.hip
+8
-6
graphbolt/src/cuda/index_select_impl.hip
graphbolt/src/cuda/index_select_impl.hip
+7
-5
graphbolt/src/cuda/insubgraph.hip
graphbolt/src/cuda/insubgraph.hip
+2
-1
graphbolt/src/cuda/isin.hip
graphbolt/src/cuda/isin.hip
+2
-1
graphbolt/src/cuda/max_uva_threads.cc
graphbolt/src/cuda/max_uva_threads.cc
+2
-1
graphbolt/src/cuda/neighbor_sampler.hip
graphbolt/src/cuda/neighbor_sampler.hip
+15
-12
graphbolt/src/cuda/sampling_utils.hip
graphbolt/src/cuda/sampling_utils.hip
+6
-5
graphbolt/src/cuda/sort_impl.hip
graphbolt/src/cuda/sort_impl.hip
+4
-3
graphbolt/src/cuda/unique_and_compact_impl.hip
graphbolt/src/cuda/unique_and_compact_impl.hip
+5
-4
graphbolt/src/expand_indptr.cc
graphbolt/src/expand_indptr.cc
+3
-2
graphbolt/src/fused_csc_sampling_graph.cc
graphbolt/src/fused_csc_sampling_graph.cc
+5
-4
graphbolt/src/index_select.cc
graphbolt/src/index_select.cc
+3
-2
graphbolt/src/isin.cc
graphbolt/src/isin.cc
+3
-2
graphbolt/src/python_binding.cc
graphbolt/src/python_binding.cc
+6
-5
graphbolt/src/random.cc
graphbolt/src/random.cc
+2
-1
graphbolt/src/shared_memory_helper.cc
graphbolt/src/shared_memory_helper.cc
+2
-1
No files found.
graphbolt/src/cuda/common.h
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2017-2023 by Contributors
* Copyright (c) 2017-2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -7,11 +8,11 @@
...
@@ -7,11 +8,11 @@
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_
#include <ATen/
cuda/CUDA
Event.h>
#include <ATen/
hip/HIP
Event.h>
#include <
c10/cuda/CUDACachingAllocator
.h>
#include <
ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA
.h>
#include <c10/
cuda/CUDA
Exception.h>
#include <c10/
hip/HIP
Exception.h>
#include <
c10/cuda/CUDAStream
.h>
#include <
ATen/hip/impl/HIPStreamMasqueradingAsCUDA
.h>
#include <
cuda
_runtime.h>
#include <
hip/hip
_runtime.h>
#include <torch/script.h>
#include <torch/script.h>
#include <memory>
#include <memory>
...
@@ -26,8 +27,8 @@ namespace cuda {
...
@@ -26,8 +27,8 @@ namespace cuda {
* that uses torch's CUDA memory pool and the current cuda stream:
* that uses torch's CUDA memory pool and the current cuda stream:
*
*
* cuda::CUDAWorkspaceAllocator allocator;
* cuda::CUDAWorkspaceAllocator allocator;
* const auto stream = torch::
cuda
::getDefault
CUDA
Stream();
* const auto stream = torch::
hip
::getDefault
HIP
Stream
MasqueradingAsCUDA
();
* const auto exec_policy = thrust::
cuda
::par_nosync(allocator).on(stream);
* const auto exec_policy = thrust::
hip
::par_nosync(allocator).on(stream);
*
*
* Now, one can pass exec_policy to thrust functions
* Now, one can pass exec_policy to thrust functions
*
*
...
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
...
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
CUDAWorkspaceAllocator
&
operator
=
(
const
CUDAWorkspaceAllocator
&
)
=
default
;
CUDAWorkspaceAllocator
&
operator
=
(
const
CUDAWorkspaceAllocator
&
)
=
default
;
void
operator
()(
void
*
ptr
)
const
{
void
operator
()(
void
*
ptr
)
const
{
c10
::
cuda
::
CUDA
CachingAllocator
::
raw_delete
(
ptr
);
c10
::
hip
::
HIP
CachingAllocator
::
raw_delete
(
ptr
);
}
}
// Required by thrust to satisfy allocator requirements.
// Required by thrust to satisfy allocator requirements.
value_type
*
allocate
(
std
::
ptrdiff_t
size
)
const
{
value_type
*
allocate
(
std
::
ptrdiff_t
size
)
const
{
return
reinterpret_cast
<
value_type
*>
(
return
reinterpret_cast
<
value_type
*>
(
c10
::
cuda
::
CUDA
CachingAllocator
::
raw_alloc
(
size
));
c10
::
hip
::
HIP
CachingAllocator
::
raw_alloc
(
size
));
}
}
// Required by thrust to satisfy allocator requirements.
// Required by thrust to satisfy allocator requirements.
...
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
...
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
inline
auto
GetAllocator
()
{
return
CUDAWorkspaceAllocator
{};
}
inline
auto
GetAllocator
()
{
return
CUDAWorkspaceAllocator
{};
}
inline
auto
GetCurrentStream
()
{
return
c10
::
cuda
::
getCurrent
CUDA
Stream
();
}
inline
auto
GetCurrentStream
()
{
return
c10
::
hip
::
getCurrent
HIP
Stream
MasqueradingAsCUDA
();
}
template
<
typename
T
>
template
<
typename
T
>
inline
bool
is_zero
(
T
size
)
{
inline
bool
is_zero
(
T
size
)
{
...
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
...
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
return
size
.
x
==
0
||
size
.
y
==
0
||
size
.
z
==
0
;
return
size
.
x
==
0
||
size
.
y
==
0
||
size
.
z
==
0
;
}
}
#define CUDA_CALL(func) C10_
CUDA
_CHECK((func))
#define CUDA_CALL(func) C10_
HIP
_CHECK((func))
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
{ \
{ \
if (!graphbolt::cuda::is_zero((nblks)) && \
if (!graphbolt::cuda::is_zero((nblks)) && \
!graphbolt::cuda::is_zero((nthrs))) { \
!graphbolt::cuda::is_zero((nthrs))) { \
auto stream = graphbolt::cuda::GetCurrentStream(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
(kernel)
<<<
(nblks), (nthrs), (shmem), stream
>>>(
__VA_ARGS__); \
hipLaunchKernelGGL((
(kernel)
), dim3(
(nblks)
)
,
dim3(
(nthrs)
)
, (shmem), stream
,
__VA_ARGS__); \
C10_
CUDA
_KERNEL_LAUNCH_CHECK(); \
C10_
HIP
_KERNEL_LAUNCH_CHECK(); \
} \
} \
}
}
...
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
...
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
auto allocator = graphbolt::cuda::GetAllocator(); \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
size_t workspace_size = 0; \
size_t workspace_size = 0; \
CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
CUDA_CALL(
hip
cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
auto workspace = allocator.AllocateStorage<char>(workspace_size); \
auto workspace = allocator.AllocateStorage<char>(workspace_size); \
CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
CUDA_CALL(
hip
cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
}
}
#define THRUST_CALL(fn, ...) \
#define THRUST_CALL(fn, ...) \
[&] { \
[&] { \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
const auto exec_policy = thrust::
cuda
::par_nosync(allocator).on(stream); \
const auto exec_policy = thrust::
hip
::par_nosync(allocator).on(stream); \
return thrust::fn(exec_policy, __VA_ARGS__); \
return thrust::fn(exec_policy, __VA_ARGS__); \
}()
}()
...
@@ -126,7 +127,7 @@ template <typename scalar_t>
...
@@ -126,7 +127,7 @@ template <typename scalar_t>
struct
CopyScalar
{
struct
CopyScalar
{
CopyScalar
()
:
is_ready_
(
true
)
{
init_pinned_storage
();
}
CopyScalar
()
:
is_ready_
(
true
)
{
init_pinned_storage
();
}
void
record
(
at
::
cuda
::
CUDAStream
stream
=
GetCurrentStream
())
{
void
record
(
at
::
hip
::
HIPStreamMasqueradingAsCUDA
stream
=
GetCurrentStream
())
{
copy_event_
.
record
(
stream
);
copy_event_
.
record
(
stream
);
is_ready_
=
false
;
is_ready_
=
false
;
}
}
...
@@ -138,9 +139,9 @@ struct CopyScalar {
...
@@ -138,9 +139,9 @@ struct CopyScalar {
CopyScalar
(
const
scalar_t
*
device_ptr
)
{
CopyScalar
(
const
scalar_t
*
device_ptr
)
{
init_pinned_storage
();
init_pinned_storage
();
auto
stream
=
GetCurrentStream
();
auto
stream
=
GetCurrentStream
();
CUDA_CALL
(
cuda
MemcpyAsync
(
CUDA_CALL
(
hip
MemcpyAsync
(
reinterpret_cast
<
scalar_t
*>
(
pinned_scalar_
.
data_ptr
()),
device_ptr
,
reinterpret_cast
<
scalar_t
*>
(
pinned_scalar_
.
data_ptr
()),
device_ptr
,
sizeof
(
scalar_t
),
cuda
MemcpyDeviceToHost
,
stream
));
sizeof
(
scalar_t
),
hip
MemcpyDeviceToHost
,
stream
));
record
(
stream
);
record
(
stream
);
}
}
...
...
graphbolt/src/cuda/cumsum.
cu
→
graphbolt/src/cuda/cumsum.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/cumsum.cu
* @file cuda/cumsum.cu
* @brief Cumsum operators implementation on CUDA.
* @brief Cumsum operators implementation on CUDA.
*/
*/
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/expand_indptr.
cu
→
graphbolt/src/cuda/expand_indptr.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -8,10 +9,10 @@
...
@@ -8,10 +9,10 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <limits>
#include <limits>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
...
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
CUB_CALL(
CUB_CALL(
DeviceCopy::Batched, input_buffer + i,
DeviceCopy::Batched, input_buffer + i,
output_buffer + i, buffer_sizes + i,
output_buffer + i, buffer_sizes + i,
std
::
min
(
num_rows
-
i
,
max_copy_at_once
));
::min(num_rows - i, max_copy_at_once));
}
}
}));
}));
}));
}));
...
...
graphbolt/src/cuda/gpu_cache.
cu
→
graphbolt/src/cuda/gpu_cache.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -6,8 +7,8 @@
...
@@ -6,8 +7,8 @@
*/
*/
#include <numeric>
#include <numeric>
#include "
./
common.h"
#include "common.h"
#include "
./
gpu_cache.h"
#include "gpu_cache.h"
namespace graphbolt {
namespace graphbolt {
namespace cuda {
namespace cuda {
...
...
graphbolt/src/cuda/index_select_csc_impl.
cu
→
graphbolt/src/cuda/index_select_csc_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -10,12 +12,12 @@
...
@@ -10,12 +12,12 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <numeric>
#include <numeric>
#include "
./
common.h"
#include "common.h"
#include "
./
max_uva_threads.h"
#include "max_uva_threads.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
...
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 block(BLOCK_SIZE);
const dim3 grid(
const dim3 grid(
(
std
::
min
(
edge_count_aligned
,
cuda
::
max_uva_threads
.
value_or
(
1
<<
20
))
+
(::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE - 1) /
BLOCK_SIZE);
BLOCK_SIZE);
...
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
...
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
CUB_CALL(
CUB_CALL(
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
buffer_sizes
+
i
,
std
::
min
(
num_nodes
-
i
,
max_copy_at_once
));
buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
}
}
}
}
...
...
graphbolt/src/cuda/index_select_impl.
cu
→
graphbolt/src/cuda/index_select_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -9,9 +11,9 @@
...
@@ -9,9 +11,9 @@
#include <numeric>
#include <numeric>
#include "
./
common.h"
#include "common.h"
#include "
./
max_uva_threads.h"
#include "max_uva_threads.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
...
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
// Use a single thread to process each output row to avoid wasting threads.
// Use a single thread to process each output row to avoid wasting threads.
const int num_threads = cuda::FindNumThreads(return_len);
const int num_threads = cuda::FindNumThreads(return_len);
const int num_blocks =
const int num_blocks =
(
std
::
min
(
return_len
,
cuda
::
max_uva_threads
.
value_or
(
1
<<
20
))
+
(::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
num_threads - 1) /
num_threads - 1) /
num_threads;
num_threads;
CUDA_KERNEL_CALL(
CUDA_KERNEL_CALL(
...
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
...
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
block.x >>= 1;
block.x >>= 1;
block.y <<= 1;
block.y <<= 1;
}
}
const
dim3
grid
(
std
::
min
(
const dim3 grid(::min(
(return_len + block.y - 1) / block.y,
(return_len + block.y - 1) / block.y,
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
...
...
graphbolt/src/cuda/insubgraph.
cu
→
graphbolt/src/cuda/insubgraph.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -8,7 +9,7 @@
...
@@ -8,7 +9,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/isin.
cu
→
graphbolt/src/cuda/isin.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -7,7 +8,7 @@
...
@@ -7,7 +8,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include <thrust/binary_search.h>
#include "
./
common.h"
#include "common.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/max_uva_threads.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/max_uva_threads.cc
* @file cuda/max_uva_threads.cc
* @brief Max uva threads variable setter function.
* @brief Max uva threads variable setter function.
*/
*/
#include "
./
max_uva_threads.h"
#include "max_uva_threads.h"
namespace
graphbolt
{
namespace
graphbolt
{
namespace
cuda
{
namespace
cuda
{
...
...
graphbolt/src/cuda/neighbor_sampler.
cu
→
graphbolt/src/cuda/neighbor_sampler.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "hip/hip_bf16.h"
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -5,7 +8,7 @@
...
@@ -5,7 +8,7 @@
* @brief Neighbor sampling operator implementation on CUDA.
* @brief Neighbor sampling operator implementation on CUDA.
*/
*/
#include <c10/core/ScalarType.h>
#include <c10/core/ScalarType.h>
#include <
cu
rand_kernel.h>
#include <
hiprand/hip
rand_kernel.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <thrust/gather.h>
#include <thrust/gather.h>
...
@@ -15,14 +18,14 @@
...
@@ -15,14 +18,14 @@
#include <algorithm>
#include <algorithm>
#include <array>
#include <array>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <limits>
#include <limits>
#include <numeric>
#include <numeric>
#include <type_traits>
#include <type_traits>
#include "../random.h"
#include "../random.h"
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
...
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;
const int stride = gridDim.x * blockDim.x;
cu
randStatePhilox4_32_10_t
rng
;
hip
randStatePhilox4_32_10_t rng;
const auto labor = indices != nullptr;
const auto labor = indices != nullptr;
if (!labor) {
if (!labor) {
cu
rand_init
(
random_seed
,
i
,
0
,
&
rng
);
hip
rand_init(random_seed, i, 0, &rng);
}
}
while (i < num_edges) {
while (i < num_edges) {
...
@@ -58,10 +61,10 @@ __global__ void _ComputeRandoms(
...
@@ -58,10 +61,10 @@ __global__ void _ComputeRandoms(
if (labor) {
if (labor) {
constexpr uint64_t kCurandSeed = 999961;
constexpr uint64_t kCurandSeed = 999961;
cu
rand_init
(
kCurandSeed
,
random_seed
,
indices
[
in_idx
],
&
rng
);
hip
rand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
}
}
const
auto
rnd
=
cu
rand_uniform
(
&
rng
);
const auto rnd =
hip
rand_uniform(&rng);
const auto prob =
const auto prob =
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
const auto exp_rnd = -__logf(rnd);
const auto exp_rnd = -__logf(rnd);
...
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
...
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
}
}
// Finally, copy the adjusted fanout values to the device memory.
// Finally, copy the adjusted fanout values to the device memory.
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
CUDA_CALL
(
cuda
MemcpyAsync
(
CUDA_CALL(
hip
MemcpyAsync(
fanouts_device.get(), fanouts_pinned_ptr,
fanouts_device.get(), fanouts_pinned_ptr,
sizeof
(
int64_t
)
*
fanouts
.
size
(),
cuda
MemcpyHostToDevice
,
sizeof(int64_t) * fanouts.size(),
hip
MemcpyHostToDevice,
cuda::GetCurrentStream()));
cuda::GetCurrentStream()));
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
...
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
...
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
"Selected edge_id_t must be capable of storing edge_ids.");
"Selected edge_id_t must be capable of storing edge_ids.");
// Using bfloat16 for random numbers works just as reliably as
// Using bfloat16 for random numbers works just as reliably as
// float32 and provides around 30 percent speedup.
// float32 and provides around 30 percent speedup.
using
rnd_t
=
nv
_bfloat16
;
using rnd_t =
__hip
_bfloat16;
auto randoms =
auto randoms =
allocator.AllocateStorage<rnd_t>(num_edges.value());
allocator.AllocateStorage<rnd_t>(num_edges.value());
auto randoms_sorted =
auto randoms_sorted =
...
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
...
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
CUB_CALL(
CUB_CALL(
DeviceCopy::Batched, input_buffer_it + i,
DeviceCopy::Batched, input_buffer_it + i,
output_buffer_it + i, sampled_degree + i,
output_buffer_it + i, sampled_degree + i,
std
::
min
(
num_rows
-
i
,
max_copy_at_once
));
::min(num_rows - i, max_copy_at_once));
}
}
}));
}));
...
...
graphbolt/src/cuda/sampling_utils.
cu
→
graphbolt/src/cuda/sampling_utils.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -7,10 +8,10 @@
...
@@ -7,10 +8,10 @@
#include <thrust/for_each.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
...
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
CUB_CALL(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
DeviceAdjacentDifference::SubtractLeftCopy,
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
num_nodes
+
1
,
cub
::
Difference
{});
num_nodes + 1,
hip
cub::Difference{});
}));
}));
in_degree = in_degree.slice(0, 1);
in_degree = in_degree.slice(0, 1);
return {in_degree, sliced_indptr};
return {in_degree, sliced_indptr};
...
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
...
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
CUB_CALL(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
DeviceAdjacentDifference::SubtractLeftCopy,
new_sub_indptr.data_ptr<indptr_t>(),
new_sub_indptr.data_ptr<indptr_t>(),
new_indegree
.
data_ptr
<
indptr_t
>
(),
num_rows
+
1
,
cub
::
Difference
{});
new_indegree.data_ptr<indptr_t>(), num_rows + 1,
hip
cub::Difference{});
}));
}));
// Discard the first element of the SubtractLeftCopy result and ensure that
// Discard the first element of the SubtractLeftCopy result and ensure that
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
...
...
graphbolt/src/cuda/sort_impl.
cu
→
graphbolt/src/cuda/sort_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -6,10 +7,10 @@
...
@@ -6,10 +7,10 @@
*/
*/
#include <c10/core/ScalarType.h>
#include <c10/core/ScalarType.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
...
graphbolt/src/cuda/unique_and_compact_impl.
cu
→
graphbolt/src/cuda/unique_and_compact_impl.
hip
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -10,11 +11,11 @@
...
@@ -10,11 +11,11 @@
#include <thrust/gather.h>
#include <thrust/gather.h>
#include <thrust/logical.h>
#include <thrust/logical.h>
#include <cub/cub.
cuh
>
#include <
hip
cub/
hip
cub.
hpp
>
#include <type_traits>
#include <type_traits>
#include "
./
common.h"
#include "common.h"
#include "
./
utils.h"
#include "utils.h"
namespace graphbolt {
namespace graphbolt {
namespace ops {
namespace ops {
...
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
...
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
// and max_id_dst.
// and max_id_dst.
if (num_bits == 0) {
if (num_bits == 0) {
num_bits = cuda::NumberOfBits(
num_bits = cuda::NumberOfBits(
1
+
std
::
max
(
1 + ::max(
static_cast<scalar_t>(max_id_src),
static_cast<scalar_t>(max_id_src),
static_cast<scalar_t>(max_id_dst)));
static_cast<scalar_t>(max_id_dst)));
}
}
...
...
graphbolt/src/expand_indptr.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
...
@@ -6,8 +7,8 @@
...
@@ -6,8 +7,8 @@
*/
*/
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include "
./
macro.h"
#include "macro.h"
#include "
./
utils.h"
#include "utils.h"
namespace
graphbolt
{
namespace
graphbolt
{
namespace
ops
{
namespace
ops
{
...
...
graphbolt/src/fused_csc_sampling_graph.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file fused_csc_sampling_graph.cc
* @file fused_csc_sampling_graph.cc
...
@@ -17,10 +18,10 @@
...
@@ -17,10 +18,10 @@
#include <tuple>
#include <tuple>
#include <vector>
#include <vector>
#include "
./
macro.h"
#include "macro.h"
#include "
./
random.h"
#include "random.h"
#include "
./
shared_memory_helper.h"
#include "shared_memory_helper.h"
#include "
./
utils.h"
#include "utils.h"
namespace
{
namespace
{
torch
::
optional
<
torch
::
Dict
<
std
::
string
,
torch
::
Tensor
>>
TensorizeDict
(
torch
::
optional
<
torch
::
Dict
<
std
::
string
,
torch
::
Tensor
>>
TensorizeDict
(
...
...
graphbolt/src/index_select.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file index_select.cc
* @file index_select.cc
...
@@ -6,8 +7,8 @@
...
@@ -6,8 +7,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include "
./
macro.h"
#include "macro.h"
#include "
./
utils.h"
#include "utils.h"
namespace
graphbolt
{
namespace
graphbolt
{
namespace
ops
{
namespace
ops
{
...
...
graphbolt/src/isin.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
*
*
...
@@ -8,8 +9,8 @@
...
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/isin.h>
#include <graphbolt/isin.h>
#include "
./
macro.h"
#include "macro.h"
#include "
./
utils.h"
#include "utils.h"
namespace
{
namespace
{
static
constexpr
int
kSearchGrainSize
=
4096
;
static
constexpr
int
kSearchGrainSize
=
4096
;
...
...
graphbolt/src/python_binding.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file python_binding.cc
* @file python_binding.cc
...
@@ -10,14 +11,14 @@
...
@@ -10,14 +11,14 @@
#include <graphbolt/unique_and_compact.h>
#include <graphbolt/unique_and_compact.h>
#ifdef GRAPHBOLT_USE_CUDA
#ifdef GRAPHBOLT_USE_CUDA
#include "
./
cuda/max_uva_threads.h"
#include "cuda/max_uva_threads.h"
#endif
#endif
#include "
./
expand_indptr.h"
#include "expand_indptr.h"
#include "
./
index_select.h"
#include "index_select.h"
#include "
./
random.h"
#include "random.h"
#ifdef GRAPHBOLT_USE_CUDA
#ifdef GRAPHBOLT_USE_CUDA
#include "
./
cuda/gpu_cache.h"
#include "cuda/gpu_cache.h"
#endif
#endif
namespace
graphbolt
{
namespace
graphbolt
{
...
...
graphbolt/src/random.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
* @file random.cc
* @file random.cc
* @brief Random Engine.
* @brief Random Engine.
*/
*/
#include "
./
random.h"
#include "random.h"
#include <torch/torch.h>
#include <torch/torch.h>
...
...
graphbolt/src/shared_memory_helper.cc
View file @
6ac701f8
// !!! This is a file automatically generated by hipify!!!
/**
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023 by Contributors
*
*
* @file shared_memory_helper.cc
* @file shared_memory_helper.cc
* @brief Shared memory helper implementation.
* @brief Shared memory helper implementation.
*/
*/
#include "
./
shared_memory_helper.h"
#include "shared_memory_helper.h"
#include <graphbolt/serialize.h>
#include <graphbolt/serialize.h>
#include <graphbolt/shared_memory.h>
#include <graphbolt/shared_memory.h>
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment