Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
975559ee
Commit
975559ee
authored
Apr 22, 2025
by
Graylatzhou
Browse files
Issue/183 Mul算子CPU&CUDA
parent
d54ee0fb
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
823 additions
and
1 deletion
+823
-1
include/infiniop.h
include/infiniop.h
+1
-0
include/infiniop/ops/mul.h
include/infiniop/ops/mul.h
+26
-0
src/infiniop-test/include/ops.hpp
src/infiniop-test/include/ops.hpp
+2
-0
src/infiniop-test/src/ops/mul.cpp
src/infiniop-test/src/ops/mul.cpp
+112
-0
src/infiniop/elementwise/cuda/elementwise_cuda.cuh
src/infiniop/elementwise/cuda/elementwise_cuda.cuh
+2
-1
src/infiniop/ops/mul/cpu/mul_cpu.cc
src/infiniop/ops/mul/cpu/mul_cpu.cc
+51
-0
src/infiniop/ops/mul/cpu/mul_cpu.h
src/infiniop/ops/mul/cpu/mul_cpu.h
+19
-0
src/infiniop/ops/mul/cuda/mul_cuda.cu
src/infiniop/ops/mul/cuda/mul_cuda.cu
+57
-0
src/infiniop/ops/mul/cuda/mul_cuda.cuh
src/infiniop/ops/mul/cuda/mul_cuda.cuh
+8
-0
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh
+26
-0
src/infiniop/ops/mul/operator.cc
src/infiniop/ops/mul/operator.cc
+120
-0
test/infiniop-test/test_generate/testcases/mul.py
test/infiniop-test/test_generate/testcases/mul.py
+156
-0
test/infiniop/mul.py
test/infiniop/mul.py
+243
-0
No files found.
include/infiniop.h
View file @
975559ee
...
...
@@ -19,5 +19,6 @@
#include "infiniop/ops/rope.h"
#include "infiniop/ops/swiglu.h"
#include "infiniop/tensor_descriptor.h"
#include "infiniop/ops/mul.h"
#endif // __INFINIOP_API_H__
include/infiniop/ops/mul.h
0 → 100644
View file @
975559ee
#ifndef __INFINIOP_MUL_API_H__
#define __INFINIOP_MUL_API_H__

#include "../operator_descriptor.h"

/// Opaque handle to an element-wise multiply (c = a * b) operator instance.
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;

/// Create a multiply descriptor binding output tensor c to inputs a and b.
__C __export infiniStatus_t infiniopCreateMulDescriptor(
    infiniopHandle_t handle,
    infiniopMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c,
    infiniopTensorDescriptor_t a,
    infiniopTensorDescriptor_t b);

/// Query the scratch-space size (in bytes) required by infiniopMul.
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(
    infiniopMulDescriptor_t desc,
    size_t *size);

/// Compute c = a * b element-wise, optionally on the given stream.
__C __export infiniStatus_t infiniopMul(
    infiniopMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    void const *a,
    void const *b,
    void *stream);

/// Release a descriptor created by infiniopCreateMulDescriptor.
__C __export infiniStatus_t infiniopDestroyMulDescriptor(
    infiniopMulDescriptor_t desc);

#endif
src/infiniop-test/include/ops.hpp
View file @
975559ee
...
...
@@ -7,6 +7,7 @@
*/
DECLARE_INFINIOP_TEST
(
gemm
)
DECLARE_INFINIOP_TEST
(
random_sample
)
DECLARE_INFINIOP_TEST
(
mul
)
#define REGISTER_INFINIOP_TEST(name) \
{ \
...
...
@@ -24,6 +25,7 @@ DECLARE_INFINIOP_TEST(random_sample)
{ \
REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(mul) \
}
namespace
infiniop_test
{
...
...
src/infiniop-test/src/ops/mul.cpp
0 → 100644
View file @
975559ee
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>

namespace infiniop_test::mul {

// Tensors for one mul test case: inputs a/b, output c, reference ans.
struct Test::Attributes {
    std::shared_ptr<Tensor> a;
    std::shared_ptr<Tensor> b;
    std::shared_ptr<Tensor> c;
    std::shared_ptr<Tensor> ans;
};

std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol,
    double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // All four tensors are mandatory for a mul test case.
    if (tensors.find("a") == tensors.end()
        || tensors.find("b") == tensors.end()
        || tensors.find("c") == tensors.end()
        || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Test");
    }
    test->_attributes->a = tensors["a"];
    test->_attributes->b = tensors["b"];
    test->_attributes->c = tensors["c"];
    test->_attributes->ans = tensors["ans"];
    return test;
}

std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id,
    size_t warm_ups, size_t iterations) {
    infiniopMulDescriptor_t op_desc;
    auto a = _attributes->a->to(device, device_id);
    auto b = _attributes->b->to(device, device_id);
    auto c = _attributes->c->to(device, device_id);
    CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
                                         c->desc(), a->desc(), b->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    // NOTE(review): the early CHECK_OR failure paths above still leak the
    // descriptor; the common paths below now clean up properly.
    CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
                         c->data(), a->data(), b->data(), nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    try {
        allClose(c, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        // Release resources before reporting the mismatch (was leaked before).
        infinirtFree(workspace);
        infiniopDestroyMulDescriptor(op_desc);
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    double elapsed_time = 0.;
    // The timed lambda launches the op twice, so warm_ups/iterations are
    // halved (rounded up) to keep the total launch count roughly unchanged.
    elapsed_time = benchmark(
        [=]() {
            infiniopMul(op_desc, workspace, workspace_size,
                        c->data(), a->data(), b->data(), nullptr);
            infiniopMul(op_desc, workspace, workspace_size,
                        c->data(), a->data(), b->data(), nullptr);
        },
        (warm_ups + 1) / 2,
        (iterations + 1) / 2);

    // Free the device workspace and destroy the descriptor (was leaked before).
    infinirtFree(workspace);
    infiniopDestroyMulDescriptor(op_desc);
    return TEST_PASSED(elapsed_time);
}

std::vector<std::string> Test::attribute_names() {
    // Mul takes no extra attributes (unlike GEMM, which needs alpha and beta).
    return {};
}

std::vector<std::string> Test::tensor_names() {
    return {"a", "b", "c", "ans"};
}

std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- a: " << _attributes->a->info() << std::endl;
    oss << "- b: " << _attributes->b->info() << std::endl;
    oss << "- c: " << _attributes->c->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}

Test::~Test() {
    delete _attributes;
}

} // namespace infiniop_test::mul
src/infiniop/elementwise/cuda/elementwise_cuda.cuh
View file @
975559ee
...
...
@@ -359,7 +359,8 @@ private:
d_input_shapes
,
d_input_strides
,
stream
));
dim3
blockDims
(
std
::
min
(
BLOCK_SIZE
,
static_cast
<
uint32_t
>
(
internal
->
maxThreadsPerBlock
())));
dim3
gridDims
(
std
::
min
(
uint32_t
(
CEIL_DIV
(
output_size
,
blockDims
.
x
)),
static_cast
<
uint32_t
>
(
internal
->
gridSizeX
())));
dim3
gridDims
(
std
::
min
(
CEIL_DIV
(
output_size
,
blockDims
.
x
),
static_cast
<
size_t
>
(
internal
->
gridSizeX
())));
size_t
step
=
gridDims
.
x
*
blockDims
.
x
;
for
(
size_t
i
=
0
;
i
<
output_size
;
i
+=
step
)
{
...
...
src/infiniop/ops/mul/cpu/mul_cpu.cc
0 → 100644
View file @
975559ee
#include "mul_cpu.h"

namespace op::mul::cpu {

Descriptor::~Descriptor() = default;

// Validate dtype/shape agreement and build the generic CPU elementwise
// descriptor for c = a * b.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    // Floating-point dtypes only; all three shapes must match exactly.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

// Dispatch on the dtype fixed at create() time. Every case returns, so the
// previously present trailing `return INFINI_STATUS_SUCCESS;` after the
// switch was unreachable and has been removed.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<MulOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<MulOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<MulOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::mul::cpu
src/infiniop/ops/mul/cpu/mul_cpu.h
0 → 100644
View file @
975559ee
#ifndef __MUL_CPU_H__
#define __MUL_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

// Declares op::mul::cpu::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(mul, cpu)

namespace op::mul::cpu {

/// Binary functor plugged into the generic CPU elementwise kernel:
/// returns the product of its two same-typed operands.
struct MulOp {
public:
    // Number of input tensors the elementwise framework must supply.
    static constexpr size_t num_inputs = 2;

    template <typename T>
    T operator()(const T &lhs, const T &rhs) const {
        return lhs * rhs;
    }
};

} // namespace op::mul::cpu

#endif // __MUL_CPU_H__
src/infiniop/ops/mul/cuda/mul_cuda.cu
0 → 100644
View file @
975559ee
#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"

namespace op::mul::cuda {

Descriptor::~Descriptor() = default;

// Validate dtype/shape agreement and build the generic CUDA elementwise
// descriptor for c = a * b.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    // Floating-point dtypes only; all three shapes must match exactly.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Launches with 256-thread blocks; dispatches on the dtype fixed at
// create() time. Every case returns, so the previously present trailing
// `return INFINI_STATUS_SUCCESS;` after the switch was unreachable and
// has been removed.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

// Fixed copy-pasted closing comment (previously said op::add::cuda).
} // namespace op::mul::cuda
src/infiniop/ops/mul/cuda/mul_cuda.cuh
0 → 100644
View file @
975559ee
#ifndef __MUL_CUDA_API_H__
#define __MUL_CUDA_API_H__

#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"

// Declares op::mul::cuda::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(mul, cuda)

#endif // __MUL_CUDA_API_H__
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh
0 → 100644
View file @
975559ee
#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__

#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>

namespace op::mul::cuda {

/// Device-side multiply functor for the generic CUDA elementwise kernel.
/// Picks the dedicated half/half2/float multiply intrinsics where they
/// exist and falls back to operator* for other types (e.g. double).
typedef struct MulOp {
    // Number of input tensors the elementwise framework must supply.
    static constexpr size_t num_inputs = 2;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(a, b); // packed pair of fp16 multiplies
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(a, b);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(a, b); // single-precision multiply, round-to-nearest-even
        } else {
            return a * b;
        }
    }
} MulOp;

// Fixed copy-pasted closing comment (previously said op::add::cuda).
} // namespace op::mul::cuda

#endif // __MUL_CUDA_H__
src/infiniop/ops/mul/operator.cc
0 → 100644
View file @
975559ee
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/mul.h"

#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/mul_cuda.cuh"
#endif

// C entry point: route descriptor creation to the backend matching the
// handle's device.
__C infiniStatus_t infiniopCreateMulDescriptor(
    infiniopHandle_t handle,
    infiniopMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

#define CREATE(CASE, NAMESPACE)                                            \
    case CASE:                                                             \
        return op::mul::NAMESPACE::Descriptor::create(                     \
            handle,                                                        \
            reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                        \
            {a_desc,                                                       \
             b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

// C entry point: report the workspace requirement of the backend descriptor.
// (Removed the unreachable duplicated return that followed the switch; the
// switch's default case already covers unsupported devices.)
__C infiniStatus_t infiniopGetMulWorkspaceSize(
    infiniopMulDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                \
    case CASE:                                                                              \
        *size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}

// C entry point: execute c = a * b through the backend descriptor.
__C infiniStatus_t infiniopMul(
    infiniopMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                         \
    case CASE:                                                             \
        return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

// C entry point: destroy a descriptor created by infiniopCreateMulDescriptor.
__C infiniStatus_t infiniopDestroyMulDescriptor(
    infiniopMulDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
test/infiniop-test/test_generate/testcases/mul.py
0 → 100644
View file @
975559ee
# Fixed: removed bogus `from ast import List` — ast.List is an AST node class,
# not a typing construct, and it was immediately shadowed by typing.List.
import numpy as np
import gguf
from typing import List

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def mul(
    a: np.ndarray,
    b: np.ndarray,
    c: np.ndarray = None,
):
    """Element-wise product of ``a`` and ``b``.

    Writes the result into ``c`` when provided (also returning it);
    otherwise returns a freshly allocated array.
    """
    if c is not None:
        return np.multiply(a, b, out=c)
    return np.multiply(a, b)
def random_tensor(shape, dtype):
    """Random array of ``shape``/``dtype`` with values in [-5e-4, 5e-4)."""
    scale = 1e-3
    offset = 0.5 * scale
    samples = np.random.rand(*shape).astype(dtype)
    # Shift the uniform [0, 1) samples into a small symmetric range so the
    # products stay well inside every float dtype's precision.
    return scale * samples - offset
class MulTestCase(InfiniopTestCase):
    """One mul test case: tensors a/b/c plus optional explicit strides."""

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        super().__init__("mul")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize this case (strides, tensors, reference answer) to gguf."""
        super().write_test(test_writer)
        # Strides are optional metadata; emit only the ones that were given.
        for key, stride in (
            ("a.strides", self.stride_a),
            ("b.strides", self.stride_b),
            ("c.strides", self.stride_c),
        ):
            if stride is not None:
                test_writer.add_array(test_writer.gguf_key(key), stride)
        for key, arr in (("a", self.a), ("b", self.b), ("c", self.c)):
            test_writer.add_tensor(
                test_writer.gguf_key(key),
                arr,
                raw_dtype=np_dtype_to_ggml(arr.dtype),
            )
        # Reference answer; note mul() also writes into self.c in place.
        ans = mul(self.a, self.b, self.c)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            ans,
            raw_dtype=np_dtype_to_ggml(ans.dtype),
        )
if __name__ == '__main__':
    test_writer = InfiniopTestWriter("mul.gguf")
    # Each spec is (shape, dtype, stride_a, stride_b, stride_c);
    # None means "no explicit strides" (contiguous layout).
    case_specs = [
        ((2, 3), np.float32, (3, 1), (3, 1), (3, 1)),
        ((2, 3), np.float16, (3, 1), (3, 1), (3, 1)),
        ((2, 3), np.float64, (3, 1), (3, 1), (3, 1)),
        ((4, 6), np.float16, (1, 4), (1, 5), (1, 4)),
        ((1, 2048), np.float16, (1, 2048), (1, 2048), (1, 2048)),
        ((2048, 2048), np.float32, None, None, None),
        ((2, 4, 2048), np.float16, None, None, None),
        ((2, 4, 2048), np.float32, None, None, None),
        ((2048, 2560), np.float32, (1, 2560), (1, 2560), (1, 2560)),
        ((4, 48, 64), np.float16, None, None, None),
        ((4, 48, 64), np.float32, None, None, None),
    ]
    test_cases = [
        MulTestCase(
            random_tensor(shape, dtype),
            gguf_strides(*sa) if sa is not None else None,
            random_tensor(shape, dtype),
            gguf_strides(*sb) if sb is not None else None,
            random_tensor(shape, dtype),
            gguf_strides(*sc) if sc is not None else None,
        )
        for shape, dtype, sa, sb, sc in case_specs
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop/mul.py
0 → 100644
View file @
975559ee
import
torch
import
ctypes
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_void_p
,
c_uint64
from
libinfiniop
import
(
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
open_lib
,
to_tensor
,
get_test_devices
,
check_error
,
rearrange_if_needed
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
create_workspace
,
)
from
enum
import
Enum
,
auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_
=
[
# shape, a_stride, b_stride, c_stride
((
13
,
4
),
None
,
None
,
None
),
((
13
,
4
),
(
10
,
1
),
(
10
,
1
),
(
10
,
1
)),
((
13
,
4
),
(
0
,
1
),
None
,
None
),
((
13
,
4
,
4
),
None
,
None
,
None
),
((
13
,
4
,
4
),
(
20
,
4
,
1
),
(
20
,
4
,
1
),
(
20
,
4
,
1
)),
((
13
,
4
,
4
),
(
4
,
0
,
1
),
(
0
,
4
,
1
),
None
),
((
16
,
5632
),
None
,
None
,
None
),
((
16
,
5632
),
(
13312
,
1
),
(
13312
,
1
),
(
13312
,
1
)),
((
4
,
4
,
5632
),
None
,
None
,
None
),
((
4
,
4
,
5632
),
(
45056
,
5632
,
1
),
(
45056
,
5632
,
1
),
(
45056
,
5632
,
1
)),
]
class
Inplace
(
Enum
):
OUT_OF_PLACE
=
auto
()
INPLACE_A
=
auto
()
INPLACE_B
=
auto
()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE
=
[
Inplace
.
OUT_OF_PLACE
,
Inplace
.
INPLACE_A
,
Inplace
.
INPLACE_B
,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES
=
[
test_case
+
(
inplace_item
,)
for
test_case
in
_TEST_CASES_
for
inplace_item
in
_INPLACE
]
# Data types used for testing
_TENSOR_DTYPES
=
[
torch
.
float16
,
torch
.
float32
]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
1e-4
,
"rtol"
:
1e-2
},
}
DEBUG
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
class MulDescriptor(Structure):
    # ctypes mirror of the opaque C descriptor; only the device id is exposed.
    _fields_ = [("device", c_int32)]


# Pointer alias matching the C `infiniopMulDescriptor_t` typedef.
infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
    """Reference implementation: element-wise product via PyTorch."""
    return torch.mul(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """Apply the requested strides to a/b/c and resolve the inplace config.

    Strides containing a 0 denote broadcasting and are installed directly on
    the tensor's storage; any other strides go through rearrange_if_needed.
    If the tensor chosen as output ends up with a broadcast (0) stride, it is
    reset to the original unbroadcast c strides so it stays writable.
    """
    original_c_strides = c_strides if c_strides else c.stride()

    def _apply(tensor, strides):
        # Broadcast view: set shape/strides directly on the untyped storage.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        return rearrange_if_needed(tensor, strides)

    a = _apply(a, a_stride)
    b = _apply(b, b_stride)
    c = _apply(c, c_strides)

    # Pick the output buffer according to the inplace configuration.
    if inplace == Inplace.INPLACE_A:
        c = a
    elif inplace == Inplace.INPLACE_B:
        c = b

    # A broadcast output cannot be written element-wise; fall back to the
    # original unbroadcast strides for c.
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def test(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    """Run one mul case through the C library and compare against PyTorch."""
    print(
        f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
        f"dtype:{dtype} inplace:{inplace}"
    )

    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    c = torch.rand(shape, dtype=dtype).to(torch_device)

    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
    ans = mul(a, b)

    a_tensor, b_tensor = [to_tensor(t, lib) for t in [a, b]]
    # The output wrapper aliases the chosen input wrapper for inplace runs.
    c_tensor = (
        to_tensor(c, lib)
        if inplace == Inplace.OUT_OF_PLACE
        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    )
    if sync is not None:
        sync()

    descriptor = infiniopMulDescriptor_t()
    check_error(
        lib.infiniopCreateMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)

    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, c.device)

    def lib_mul():
        # One library invocation of c = a * b on the bound tensors.
        check_error(
            lib.infiniopMul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                None,
            )
        )

    lib_mul()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    lib = open_lib()

    # Declare the C signatures once so ctypes can type-check each call.
    lib.infiniopCreateMulDescriptor.restype = c_int32
    lib.infiniopCreateMulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]

    lib.infiniopGetMulWorkspaceSize.restype = c_int32
    lib.infiniopGetMulWorkspaceSize.argtypes = [
        infiniopMulDescriptor_t,
        POINTER(c_uint64),
    ]

    lib.infiniopMul.restype = c_int32
    lib.infiniopMul.argtypes = [
        infiniopMulDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]

    lib.infiniopDestroyMulDescriptor.restype = c_int32
    lib.infiniopDestroyMulDescriptor.argtypes = [
        infiniopMulDescriptor_t,
    ]

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment