Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
92ad2426
Unverified
Commit
92ad2426
authored
Mar 10, 2025
by
PanZezhong1725
Committed by
GitHub
Mar 10, 2025
Browse files
Merge pull request #95 from YdrMaster/main
issue/87/feat: cublas 和 cudnn 检查并返回错误信息
parents
d5422e5b
911115fb
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
191 additions
and
63 deletions
+191
-63
src/infiniop/devices/cuda/cuda_common.cu
src/infiniop/devices/cuda/cuda_common.cu
+92
-0
src/infiniop/devices/cuda/cuda_common.cuh
src/infiniop/devices/cuda/cuda_common.cuh
+43
-0
src/infiniop/devices/cuda/cuda_handle.cuh
src/infiniop/devices/cuda/cuda_handle.cuh
+28
-37
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
+28
-26
No files found.
src/infiniop/devices/cuda/cuda_
handle
.cu
→
src/infiniop/devices/cuda/cuda_
common
.cu
View file @
92ad2426
...
...
@@ -4,35 +4,56 @@ namespace device::cuda {
// Construct a CUDA handle for the given device, creating the shared
// Internal state (cached device limits plus cublas/cudnn handle pools).
// The diff's committed form forwards device_id to Internal so it can query
// the right device's properties; the older zero-argument form is dropped.
Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
      _internal(std::make_shared<Handle::Internal>(device_id)) {}
// Expose the shared Internal state so operator descriptors can keep it alive
// beyond the lifetime of this Handle.
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
    return _internal;
}
// File-local callback alias: a callable taking a library handle.
// NOTE(review): the committed class-scope Handle::Internal::Fn returns
// infiniStatus_t; this void-returning namespace-scope alias appears to be the
// pre-change version shown by the diff — TODO confirm it is still referenced.
template <typename T>
using Fn = std::function<void(T)>;
// Query the CUDA device properties once and cache the launch limits
// (warp size, max threads per block, per-dimension block and grid limits)
// for cheap, repeated lookup by operator kernels.
Handle::Internal::Internal(int device_id) {
    cudaDeviceProp prop;
    // NOTE(review): the cudaError_t result is not checked here; on failure
    // `prop` would be read uninitialized — consider checking once a
    // status-returning initialization path exists. TODO confirm intent.
    cudaGetDeviceProperties(&prop, device_id);
    _warp_size = prop.warpSize;
    _max_threads_per_block = prop.maxThreadsPerBlock;
    _block_size[0] = prop.maxThreadsDim[0];
    _block_size[1] = prop.maxThreadsDim[1];
    _block_size[2] = prop.maxThreadsDim[2];
    _grid_size[0] = prop.maxGridSize[0];
    _grid_size[1] = prop.maxGridSize[1];
    _grid_size[2] = prop.maxGridSize[2];
}
void
Handle
::
Internal
::
use
_c
ublas
(
cudaStream_t
stream
,
const
Fn
<
cublasHandle_t
>
&
f
)
const
{
infiniStatus_t
Handle
::
Internal
::
use
C
ublas
(
cudaStream_t
stream
,
const
Fn
<
cublasHandle_t
>
&
f
)
const
{
auto
handle
=
blas_handles
.
pop
();
if
(
!
handle
)
{
cublasCreate
(
&
(
*
handle
));
CHECK_CUBLAS
(
cublasCreate
(
&
(
*
handle
))
)
;
}
cublasSetStream
(
*
handle
,
stream
);
f
(
*
handle
);
CHECK_CUBLAS
(
cublasSetStream
(
*
handle
,
stream
)
)
;
CHECK_STATUS
(
f
(
*
handle
)
)
;
blas_handles
.
push
(
std
::
move
(
*
handle
));
return
INFINI_STATUS_SUCCESS
;
}
void
Handle
::
Internal
::
use
_c
udnn
(
cudaStream_t
stream
,
const
Fn
<
cudnnHandle_t
>
&
f
)
const
{
infiniStatus_t
Handle
::
Internal
::
use
C
udnn
(
cudaStream_t
stream
,
const
Fn
<
cudnnHandle_t
>
&
f
)
const
{
auto
handle
=
dnn_handles
.
pop
();
if
(
!
handle
)
{
cudnnCreate
(
&
(
*
handle
));
CHECK_CUDNN
(
cudnnCreate
(
&
(
*
handle
))
)
;
}
cudnnSetStream
(
*
handle
,
stream
);
f
(
*
handle
);
CHECK_CUDNN
(
cudnnSetStream
(
*
handle
,
stream
)
)
;
CHECK_STATUS
(
f
(
*
handle
)
)
;
dnn_handles
.
push
(
std
::
move
(
*
handle
));
return
INFINI_STATUS_SUCCESS
;
}
// Accessors for the device launch limits cached by Internal's constructor.
int Handle::Internal::warpSize() const {
    return _warp_size;
}
int Handle::Internal::maxThreadsPerBlock() const {
    return _max_threads_per_block;
}
int Handle::Internal::blockSizeX() const {
    return _block_size[0];
}
int Handle::Internal::blockSizeY() const {
    return _block_size[1];
}
int Handle::Internal::blockSizeZ() const {
    return _block_size[2];
}
int Handle::Internal::gridSizeX() const {
    return _grid_size[0];
}
int Handle::Internal::gridSizeY() const {
    return _grid_size[1];
}
int Handle::Internal::gridSizeZ() const {
    return _grid_size[2];
}
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
)
{
switch
(
dt
)
{
case
INFINI_DTYPE_F16
:
...
...
src/infiniop/devices/cuda/cuda_common.cuh
0 → 100644
View file @
92ad2426
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
namespace
device
::
cuda
{
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
);
// Return the memory offset into the original (un-broadcast) tensor for a
// given flat index of the broadcasted tensor.
// `broadcasted_strides` act as per-dimension divisors of the flat index;
// `target_strides` are the original tensor's strides (0 on broadcast dims).
// NOTE(review): assumes broadcasted_strides[i] != 0 for i < ndim — division
// by zero otherwise; TODO confirm callers guarantee this.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        // Coordinate along dim i, scaled into the original tensor's layout.
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}
// Get the memory offset of an element in a (possibly strided) tensor given
// its flat index, walking dimensions from innermost to outermost.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    // `i-- > 0` counts i down from ndim-1 to 0 without size_t underflow.
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
}
// namespace device::cuda
#endif // __INFINIOP_CUDA_COMMON_CUH__
src/infiniop/devices/cuda/cuda_handle.cuh
View file @
92ad2426
#ifndef __INFINIOP_CUDA_
INTERNAL_
H__
#define __INFINIOP_CUDA_
INTERNAL_
H__
#ifndef __INFINIOP_CUDA_
HANDLE_CU
H__
#define __INFINIOP_CUDA_
HANDLE_CU
H__
#include "../../../utils.h"
#include "../pool.h"
#include "cuda_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
// NOTE(review): CHECK_INTERNAL comes from ../../../utils.h — presumably it
// compares the call's result against the given success code and returns an
// error status on mismatch; verify there.
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace
device
::
cuda
{
/// Per-device internal state shared by operator descriptors on a Handle:
/// cached device launch limits plus pooled cublas/cudnn library handles.
/// The committed interface (kept here) replaces the old void-returning
/// use_cublas/use_cudnn with status-returning useCublas/useCudnn and adds
/// the device_id-taking constructor.
class Handle::Internal {
    Pool<cublasHandle_t> blas_handles;
    Pool<cudnnHandle_t> dnn_handles;

    // Cached from cudaDeviceProp in the constructor.
    int _warp_size, _max_threads_per_block, _block_size[3], _grid_size[3];

    // Callback type: receives a library handle already bound to the caller's
    // stream and reports success/failure so errors can be propagated.
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

public:
    Internal(int);

    // Run `f` with a pooled cublas/cudnn handle bound to `stream`; returns
    // the first error from handle setup or from `f` itself.
    infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
    infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;

    // Device launch limits captured at construction.
    int warpSize() const;
    int maxThreadsPerBlock() const;
    int blockSizeX() const;
    int blockSizeY() const;
    int blockSizeZ() const;
    int gridSizeX() const;
    int gridSizeY() const;
    int gridSizeZ() const;
};
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
);
// Return the memory offset into the original (un-broadcast) tensor for a
// given flat index of the broadcasted tensor. `broadcasted_strides` divide
// the flat index per dimension; `target_strides` map the coordinate back
// into the original layout.
// NOTE(review): the diff shows this pair being moved to cuda_common.cuh in
// this commit — these copies are likely the removed originals.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

// Get the memory offset of an element in a strided tensor from its flat
// index, consuming dimensions innermost-first.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
}
// namespace device::cuda
#endif // __INFINIOP_CUDA_
INTERNAL_
H__
#endif // __INFINIOP_CUDA_
HANDLE_CU
H__
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
View file @
92ad2426
...
...
@@ -76,34 +76,36 @@ infiniStatus_t Descriptor::calculate(
auto
op_a
=
_info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
_info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
_opaque
->
internal
->
use
_c
ublas
(
CHECK_STATUS
(
_opaque
->
internal
->
use
C
ublas
(
(
cudaStream_t
)
stream
,
[
&
](
cublasHandle_t
handle
)
{
cublasGemmStridedBatchedEx
(
handle
,
op_a
,
op_b
,
static_cast
<
int
>
(
_info
.
m
),
static_cast
<
int
>
(
_info
.
n
),
static_cast
<
int
>
(
_info
.
k
),
&
alpha
,
a
,
a_type
,
static_cast
<
int
>
(
_info
.
a_matrix
.
ld
()),
_info
.
a_matrix
.
stride
,
b
,
b_type
,
static_cast
<
int
>
(
_info
.
b_matrix
.
ld
()),
_info
.
b_matrix
.
stride
,
&
beta
,
c
,
c_type
,
static_cast
<
int
>
(
_info
.
c_matrix
.
ld
()),
_info
.
c_matrix
.
stride
,
static_cast
<
int
>
(
_info
.
batch
),
compute_type
,
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
});
CHECK_CUBLAS
(
cublasGemmStridedBatchedEx
(
handle
,
op_a
,
op_b
,
static_cast
<
int
>
(
_info
.
m
),
static_cast
<
int
>
(
_info
.
n
),
static_cast
<
int
>
(
_info
.
k
),
&
alpha
,
a
,
a_type
,
static_cast
<
int
>
(
_info
.
a_matrix
.
ld
()),
_info
.
a_matrix
.
stride
,
b
,
b_type
,
static_cast
<
int
>
(
_info
.
b_matrix
.
ld
()),
_info
.
b_matrix
.
stride
,
&
beta
,
c
,
c_type
,
static_cast
<
int
>
(
_info
.
c_matrix
.
ld
()),
_info
.
c_matrix
.
stride
,
static_cast
<
int
>
(
_info
.
batch
),
compute_type
,
CUBLAS_GEMM_DEFAULT_TENSOR_OP
));
return
INFINI_STATUS_SUCCESS
;
}));
return
INFINI_STATUS_SUCCESS
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment