Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
2f2a74b6
Commit
2f2a74b6
authored
Mar 24, 2025
by
Zimin Li
Browse files
Merge remote-tracking branch 'upstream/main'
parents
1d95ddf3
70806eed
Changes
80
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
755 additions
and
560 deletions
+755
-560
src/infiniop/ops/random_sample/random_sample.h
src/infiniop/ops/random_sample/random_sample.h
+56
-0
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
+95
-0
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
+8
-0
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
+37
-0
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+11
-19
src/infiniop/ops/rms_norm/rms_norm.h
src/infiniop/ops/rms_norm/rms_norm.h
+4
-0
src/infiniop/reduce/cuda/reduce.cuh
src/infiniop/reduce/cuda/reduce.cuh
+26
-0
src/utils-test/test_rearrange.cc
src/utils-test/test_rearrange.cc
+10
-12
src/utils/check.h
src/utils/check.h
+13
-0
src/utils/rearrange.cc
src/utils/rearrange.cc
+1
-0
test/infiniop-test/README.md
test/infiniop-test/README.md
+67
-0
test/infiniop-test/test_generate/__init__.py
test/infiniop-test/test_generate/__init__.py
+1
-0
test/infiniop-test/test_generate/infiniop_test.py
test/infiniop-test/test_generate/infiniop_test.py
+68
-0
test/infiniop-test/test_generate/testcases/__init__.py
test/infiniop-test/test_generate/testcases/__init__.py
+0
-0
test/infiniop-test/test_generate/testcases/gemm.py
test/infiniop-test/test_generate/testcases/gemm.py
+196
-0
test/infiniop/gemm.py
test/infiniop/gemm.py
+122
-293
test/infiniop/matmul.py
test/infiniop/matmul.py
+0
-212
test/infiniop/random_sample.py
test/infiniop/random_sample.py
+12
-23
xmake.lua
xmake.lua
+5
-1
xmake/test.lua
xmake/test.lua
+23
-0
No files found.
src/infiniop/ops/random_sample/random_sample.h
0 → 100644
View file @
2f2a74b6
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
// Declares a device-specific random_sample Descriptor class inside
// op::random_sample::<NAMESPACE>. Each backend (cpu, cuda, ...) expands this
// macro once and then provides definitions for the declared members.
// The Descriptor holds an opaque, backend-owned state blob plus the dtype and
// size information validated at create() time.
#define DESCRIPTOR(NAMESPACE)                                 \
                                                              \
    namespace op::random_sample::NAMESPACE {                  \
    class Descriptor final : public InfiniopDescriptor {      \
        struct Opaque; /* backend-private state */            \
        Opaque *_opaque;                                      \
                                                              \
        infiniDtype_t _dt_i, _dt_p; /* index & prob dtypes */ \
        size_t _n, _min_workspace_size;                       \
                                                              \
        Descriptor(                                           \
            infiniDtype_t dt_i,                               \
            infiniDtype_t dt_p,                               \
            size_t n,                                         \
            size_t min_workspace_size,                        \
            Opaque *opaque,                                   \
            infiniDevice_t device_type,                       \
            int device_id)                                    \
            : InfiniopDescriptor{device_type, device_id},     \
              _opaque(opaque),                                \
              _dt_i(dt_i),                                    \
              _dt_p(dt_p),                                    \
              _n(n),                                          \
              _min_workspace_size(min_workspace_size) {}      \
                                                              \
    public:                                                   \
        ~Descriptor();                                        \
                                                              \
        static infiniStatus_t create(                         \
            infiniopHandle_t handle,                          \
            Descriptor **desc_ptr,                            \
            infiniopTensorDescriptor_t result_desc,           \
            infiniopTensorDescriptor_t probs_desc);           \
                                                              \
        size_t minWorkspaceSize() const;                      \
                                                              \
        infiniStatus_t calculate(                             \
            void *workspace,                                  \
            size_t workspace_size,                            \
            void *result,                                     \
            const void *probs,                                \
            float random_val,                                 \
            float topp,                                       \
            int topk,                                         \
            float temperature,                                \
            void *stream) const;                              \
    };                                                        \
    }
#endif // __RANDOM_SAMPLE_H__
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
0 → 100644
View file @
2f2a74b6
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include <memory>
#include <stdint.h>

namespace op::rms_norm::cuda {

// Backend-private state: a shared handle to the device internals
// (capabilities such as max threads per block).
struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

/// Validate the tensor descriptors and build a CUDA rms_norm descriptor.
/// Fails with INFINI_STATUS_BAD_TENSOR_STRIDES unless the last dimension
/// of both x and y is contiguous (stride 1).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon) {
    RMSNormInfo info;
    CHECK_STATUS(createRMSNormInfo(&info, y_desc, x_desc, w_desc, epsilon));

    // Only a contiguous last dimension is supported by the kernel.
    if (info.x_strides[1] != 1 || info.y_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    auto *cuda_handle = reinterpret_cast<device::cuda::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{cuda_handle->internal()},
        info,
        0, // no workspace required
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

/// Dispatch rmsnormBlock on the concrete (data, weight) dtype pair.
/// Supported combinations: (F16,F16), (F16,F32), (F32,F32); the
/// accumulator type is always float.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t batch_size,
    size_t dim,
    void *y,
    infiniDtype_t atype,
    ptrdiff_t stride_y,
    const void *x,
    ptrdiff_t stride_x,
    const void *w,
    infiniDtype_t wtype,
    float epsilon,
    cudaStream_t cuda_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                   \
    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
        reinterpret_cast<Tdata *>(y),                                                             \
        stride_y,                                                                                 \
        reinterpret_cast<const Tdata *>(x),                                                       \
        stride_x,                                                                                 \
        reinterpret_cast<const Tweight *>(w),                                                     \
        dim,                                                                                      \
        epsilon)

    if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, half, float);
    } else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(half, float, float);
    } else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(float, float, float);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}

/// Run rms_norm: one CUDA block per row, block size chosen from the
/// device's max-threads-per-block capability.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    void *stream) {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    auto row_stride_x = _info.x_strides[0];
    auto row_stride_y = _info.y_strides[0];
    auto row_len = _info.dim();
    uint32_t num_rows = static_cast<uint32_t>(_info.shape[0]);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // Pick the template instantiation matching the device's block size.
    auto max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads == CUDA_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
            num_rows, row_len, y, _info.atype, row_stride_y,
            x, row_stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
    } else if (max_threads == CUDA_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
            num_rows, row_len, y, _info.atype, row_stride_y,
            x, row_stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::rms_norm::cuda
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
0 → 100644
View file @
2f2a74b6
#ifndef __RMS_NORM_CUDA_H__
#define __RMS_NORM_CUDA_H__

#include "../rms_norm.h"

// Declare the rms_norm Descriptor for the CUDA backend
// (expands inside op::rms_norm::cuda).
DESCRIPTOR(cuda)

#endif // __RMS_NORM_CUDA_H__
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
0 → 100644
View file @
2f2a74b6
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__

#include "../../../devices/cuda/cuda_common.cuh"
#include <cub/block/block_reduce.cuh>

/// RMS-norm of one row per CUDA block:
///   y[i] = x[i] * w[i] / sqrt(mean(x^2) + epsilon)
/// Tdata   — element type of x/y (e.g. half, float)
/// Tweight — element type of w
/// Tcompute — accumulation type (typically float)
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
INFINIOP_CUDA_KERNEL rmsnormBlock(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x,
    const Tweight *__restrict__ w,
    size_t dim,
    float epsilon) {
    // Each block takes care of a row of continuous data of length dim.
    // Each thread deals with every BLOCK_SIZE-th element in the row.
    auto y_ptr = y + blockIdx.x * stride_y;
    auto x_ptr = x + blockIdx.x * stride_x;
    auto w_ptr = w;

    // Block-reduce sum of x^2.
    Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(x_ptr, dim);

    // Thread 0 computes RMS = 1/sqrt(ss/dim + epsilon) and stores it in
    // shared memory. FIX: keep the value in Tcompute precision — the previous
    // code narrowed the result through Tdata (e.g. half) before storing it in
    // the Tcompute shared variable, needlessly discarding precision.
    __shared__ Tcompute rms;
    if (threadIdx.x == 0) {
        rms = Tcompute(rsqrtf(ss / Tcompute(dim) + epsilon));
    }
    __syncthreads();

    // Scale every element of the row by the shared reciprocal RMS.
    for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
        y_ptr[i] = Tdata(Tcompute(x_ptr[i]) * Tcompute(w_ptr[i]) * rms);
    }
}

#endif
src/infiniop/ops/rms_norm/operator.cc
View file @
2f2a74b6
...
...
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rms_norm_cuda.cuh"
#endif
__C
infiniStatus_t
infiniopCreateRMSNormDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -28,10 +31,8 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaCreateRMSNormDescriptor
((
CudaHandle_t
)
handle
,
(
RMSNormCudaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -76,11 +77,8 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaGetRMSNormWorkspaceSize
((
RMSNormCudaDescriptor_t
)
desc
,
size
);
}
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -122,11 +120,8 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaRMSNorm
((
RMSNormCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -172,11 +167,8 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#ifdef ENABLE_CPU_API
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaDestroyRMSNormDescriptor
((
RMSNormCudaDescriptor_t
)
desc
);
}
#ifdef ENABLE_CUDA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
src/infiniop/ops/rms_norm/rms_norm.h
View file @
2f2a74b6
...
...
@@ -51,6 +51,10 @@ inline infiniStatus_t createRMSNormInfo(RMSNormInfo *info, infiniopTensorDescrip
return
INFINI_STATUS_BAD_TENSOR_SHAPE
;
}
if
(
w_desc
->
stride
(
0
)
!=
1
)
{
return
INFINI_STATUS_BAD_TENSOR_STRIDES
;
}
info
->
shape
=
std
::
move
(
y_desc
->
shape
());
info
->
y_strides
=
std
::
move
(
y_desc
->
strides
());
info
->
x_strides
=
std
::
move
(
x_desc
->
strides
());
...
...
src/infiniop/reduce/cuda/reduce.cuh
0 → 100644
View file @
2f2a74b6
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__

#include <cub/block/block_reduce.cuh>

namespace op::common_cuda::reduce_op {

/// Block-level sum of squares of count elements starting at data_ptr.
/// Each thread accumulates a strided partial sum, then CUB reduces across
/// the block. Only thread 0 receives the meaningful return value (CUB
/// BlockReduce semantics); all BLOCK_SIZE threads must call this.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    Tcompute ss = 0;
    // Each thread computes its partial sum.
    // FIX: promote each operand to Tcompute BEFORE multiplying — squaring in
    // Tdata (e.g. half) underflows/loses precision for small magnitudes
    // (|x| < ~2.5e-2 squares into the half subnormal range) before the
    // conversion the old code applied to the finished product.
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        Tcompute v = Tcompute(data_ptr[i]);
        ss += v * v;
    }
    // Use CUB block-level reduction.
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    return BlockReduce(temp_storage).Sum(ss);
}

} // namespace op::common_cuda::reduce_op

#endif
src/utils-test/test_rearrange.cc
View file @
2f2a74b6
...
...
@@ -45,28 +45,26 @@ size_t check_equal(
return
fails
;
}
int
test_transpose_2d
()
{
std
::
vector
<
size_t
>
shape
=
{
3
,
5
};
std
::
vector
<
ptrdiff_t
>
strides_a
=
{
5
,
1
};
std
::
vector
<
ptrdiff_t
>
strides_b
=
{
1
,
3
};
int
test_transpose_any
(
size_t
index
,
std
::
vector
<
size_t
>
shape
,
std
::
vector
<
ptrdiff_t
>
strides_a
,
std
::
vector
<
ptrdiff_t
>
strides_b
)
{
auto
numel
=
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
());
std
::
vector
<
float
>
a
(
numel
);
std
::
vector
<
float
>
b
(
numel
);
for
(
size_t
i
=
0
;
i
<
numel
;
i
++
)
{
a
[
i
]
=
i
/
numel
;
a
[
i
]
=
(
float
)
i
/
numel
;
}
utils
::
rearrange
(
b
.
data
(),
a
.
data
(),
shape
.
data
(),
strides_b
.
data
(),
strides_a
.
data
(),
2
,
sizeof
(
float
));
if
(
check_equal
<
float
>
(
a
.
data
(),
b
.
data
(),
shape
,
strides_a
,
strides_b
))
{
utils
::
rearrange
(
b
.
data
(),
a
.
data
(),
shape
.
data
(),
strides_b
.
data
(),
strides_a
.
data
(),
shape
.
size
(),
sizeof
(
float
));
auto
fails
=
check_equal
<
float
>
(
a
.
data
(),
b
.
data
(),
shape
,
strides_a
,
strides_b
);
if
(
fails
>
0
)
{
std
::
cout
<<
"test_transpose "
<<
index
<<
" failed"
<<
std
::
endl
;
return
1
;
}
else
{
std
::
cout
<<
"test_transpose_2d passed"
<<
std
::
endl
;
std
::
cout
<<
"test_transpose "
<<
index
<<
" passed"
<<
std
::
endl
;
return
0
;
}
return
0
;
}
int
test_rearrange
()
{
return
test_transpose_2d
();
return
test_transpose_any
(
1
,
{
3
,
5
},
{
5
,
1
},
{
1
,
3
})
+
test_transpose_any
(
2
,
{
1
,
2048
},
{
2048
,
1
},
{
2048
,
1
});
}
src/utils/check.h
View file @
2f2a74b6
...
...
@@ -17,4 +17,17 @@
// Propagate any non-success status from API to the caller.
#define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_)
// Return INFINI_STATUS_BAD_TENSOR_DTYPE unless DT is one of the dtypes
// listed in the variadic arguments. Wrapped in do/while(0) so it behaves
// like a single statement at the call site.
#define CHECK_DTYPE(DT, ...) \
    do { \
        auto found_supported_dtype = false; \
        for (auto dt : {__VA_ARGS__}) { \
            if (dt == DT) { \
                found_supported_dtype = true; \
                break; \
            } \
        } \
        CHECK_API_OR(found_supported_dtype, true, \
                     return INFINI_STATUS_BAD_TENSOR_DTYPE); \
    } while (0)
#endif // INFINIUTILS_CHECK_H
src/utils/rearrange.cc
View file @
2f2a74b6
...
...
@@ -46,6 +46,7 @@ std::optional<RearrangeMeta> RearrangeMeta::create(
}
return
std
::
abs
(
a
.
dst
)
>
std
::
abs
(
b
.
dst
);
});
ndim
=
dims
.
size
();
// # 合并连续维度
// ## 合并末尾连续维度到 unit
for
(
auto
it
=
dims
.
rbegin
();
it
!=
dims
.
rend
();
++
it
)
{
...
...
test/infiniop-test/README.md
0 → 100644
View file @
2f2a74b6
# InfiniOP 测例生成
## 介绍
使用 python 脚本生成包含测例的
`.gguf`
文件,并使用
`infiniop-test`
程序进行测试。
## 运行方式
-
编译
`infiniop-test`
程序
```
bash
xmake build infiniop-test
```
-
生成测例
在
`/test/infiniop-test/`
目录执行矩阵乘测例生成脚本,执行结束以后会在
`/test/infiniop-test/`
目录生成
`gemm.gguf`
测例文件。
```
bash
cd
/test/infiniop-test/
python
-m
test_generate.testcases.gemm
```
-
测试测例
打印测试程序用法
```
bash
infiniop-test
--help
```
示例:在CPU上测试
`gemm.gguf`
测例文件,预热20次,测试1000次。
```
bash
infiniop-test gemm.gguf
--cpu
--warmup
20
--run
1000
```
## 自定义测例
### GGUF文件格式
```
text
GGUF File Contents:
Version: 3
Number of Meta KVs: 8
Number of Tensors: 4
Meta KVs:
Key: general.architecture, Type: GGUF_TYPE_STRING, Value: infiniop-test
Key: test_count, Type: GGUF_TYPE_UINT64, Value: 1
Key: test.0.op_name, Type: GGUF_TYPE_STRING, Value: matmul
Key: test.0.a.strides, Type: GGUF_TYPE_INT32, Value: [1, 5]
Key: test.0.b.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.c.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.alpha, Type: GGUF_TYPE_FLOAT32, Value: 1.000000
Key: test.0.beta, Type: GGUF_TYPE_FLOAT32, Value: 0.000000
Tensor INFOs:
Name: test.0.a, NDims: 2, Shape: [5, 4], DataType: F32, DataOffset: 0
Name: test.0.b, NDims: 2, Shape: [6, 5], DataType: F32, DataOffset: 96
Name: test.0.c, NDims: 2, Shape: [6, 4], DataType: F32, DataOffset: 224
Name: test.0.ans, NDims: 2, Shape: [6, 4], DataType: F64, DataOffset: 320
```
-
`Meta`
中必须包含
`test_count`
,表示测例数量。
-
每个测例的
`Meta`
和
`Tensor`
名字以
`test.[id].`
开头,后接具体信息名称。数字
`[id]`
表示测例编号。编号必须为 0 到 `test_count - 1`。
-
`Tensor`
名字接
`.strides`
表示步长,若没有则默认为连续。
test/infiniop-test/test_generate/__init__.py
0 → 100644
View file @
2f2a74b6
from
.infiniop_test
import
InfiniopTestCase
,
InfiniopTestWriter
,
np_dtype_to_ggml
,
gguf_strides
test/infiniop-test/test_generate/infiniop_test.py
0 → 100644
View file @
2f2a74b6
import
gguf
from
typing
import
List
import
numpy
as
np
from
gguf
import
GGMLQuantizationType
def np_dtype_to_ggml(tensor_dtype: np.dtype):
    """Map a numpy dtype to the matching GGML quantization type.

    Raises ValueError for any dtype outside the supported set
    (F16, F32, F64, I8, I16, I32, I64).
    """
    dtype_table = (
        (np.float16, GGMLQuantizationType.F16),
        (np.float32, GGMLQuantizationType.F32),
        (np.float64, GGMLQuantizationType.F64),
        (np.int8, GGMLQuantizationType.I8),
        (np.int16, GGMLQuantizationType.I16),
        (np.int32, GGMLQuantizationType.I32),
        (np.int64, GGMLQuantizationType.I64),
    )
    for np_type, ggml_type in dtype_table:
        if tensor_dtype == np_type:
            return ggml_type
    raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
def
gguf_strides
(
*
args
:
int
)
->
list
[
int
]
|
None
:
return
list
(
args
)[::
-
1
]
if
args
else
None
class InfiniopTestCase:
    """Base class for a single operator test case.

    Subclasses override write_test() to serialize their tensors and
    attributes, calling super().write_test() first so the op name is
    always recorded.
    """

    # Name of the operator under test (e.g. "gemm").
    op_name: str

    def __init__(self, op_name: str):
        self.op_name = op_name

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Write this case's metadata into the gguf test file."""
        key = test_writer.gguf_key("op_name")
        test_writer.add_string(key, self.op_name)
class InfiniopTestWriter(gguf.GGUFWriter):
    """gguf writer that collects InfiniopTestCase objects and serializes
    them as an `infiniop-test` architecture file.

    Keys for each case are namespaced as "test.<index>.<name>", where the
    index is the position of the case in registration order.
    """

    # Registered but not-yet-written test cases.
    _test_cases: List["InfiniopTestCase"]
    # Index of the case currently being written; used by gguf_key().
    _written_tests = 0

    def __init__(self, filepath):
        super().__init__(filepath, "infiniop-test")
        self._test_cases = []
        self._written_tests = 0

    def add_test(self, test_case: "InfiniopTestCase"):
        """Register one test case."""
        self._test_cases.append(test_case)

    def add_tests(self, test_cases: List["InfiniopTestCase"]):
        """Register several test cases at once."""
        self._test_cases.extend(test_cases)

    def gguf_key(self, name: str) -> str:
        """Namespace a key under the test case currently being written."""
        return f"test.{self._written_tests}.{name}"

    def save(self):
        """Serialize every registered case and finalize the gguf file."""
        super().add_uint64("test_count", len(self._test_cases))
        for test_case in self._test_cases:
            test_case.write_test(self)
            self._written_tests += 1
        super().write_header_to_file()
        super().write_kv_data_to_file()
        super().write_tensors_to_file()
        super().close()
test/infiniop-test/test_generate/testcases/__init__.py
0 → 100644
View file @
2f2a74b6
test/infiniop-test/test_generate/testcases/gemm.py
0 → 100644
View file @
2f2a74b6
from
ast
import
List
import
numpy
as
np
import
gguf
from
typing
import
List
from
..
import
InfiniopTestWriter
,
InfiniopTestCase
,
np_dtype_to_ggml
,
gguf_strides
def gemm(
    a: np.ndarray,
    b: np.ndarray,
    alpha: float = 1.0,
    c: np.ndarray = None,
    beta: float = 0.0,
):
    """Reference GEMM: alpha * (a @ b) + beta * c.

    When c is None the beta term is omitted entirely.
    """
    scaled_product = alpha * np.matmul(a, b)
    if c is None:
        return scaled_product
    return scaled_product + beta * c
def random_tensor(shape, dtype):
    """Uniform random tensor of the given shape/dtype.

    Values lie in [-5e-4, 5e-4): rand() in [0, 1) is scaled by 1e-3 and
    shifted down by half that scale. Kept deliberately small so fp16
    accumulation in the tested operators stays accurate.
    """
    scale = 1e-3
    half_range = 0.5 * scale
    samples = np.random.rand(*shape).astype(dtype)
    return scale * samples - half_range
class GemmTestCase(InfiniopTestCase):
    """One GEMM test case: operands a/b/c with optional explicit strides,
    plus alpha/beta scalars. The reference answer is computed in float64
    and stored alongside the inputs.
    """

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
        alpha: float,
        beta: float,
    ):
        super().__init__("gemm")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c
        self.alpha = alpha
        self.beta = beta

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize strides, scalars, operands, and the f64 reference answer."""
        super().write_test(test_writer)

        # Strides are optional; absence means "contiguous".
        for label, strides in (
            ("a.strides", self.stride_a),
            ("b.strides", self.stride_b),
            ("c.strides", self.stride_c),
        ):
            if strides is not None:
                test_writer.add_array(test_writer.gguf_key(label), strides)

        test_writer.add_float32(test_writer.gguf_key("alpha"), self.alpha)
        test_writer.add_float32(test_writer.gguf_key("beta"), self.beta)

        for label, tensor in (("a", self.a), ("b", self.b), ("c", self.c)):
            test_writer.add_tensor(
                test_writer.gguf_key(label),
                tensor,
                raw_dtype=np_dtype_to_ggml(tensor.dtype),
            )

        # Reference answer computed at full float64 precision.
        ans = gemm(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
            self.alpha,
            self.c.astype(np.float64),
            self.beta,
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            ans,
            raw_dtype=gguf.GGMLQuantizationType.F64,
        )
if __name__ == "__main__":
    test_writer = InfiniopTestWriter("gemm.gguf")
    # Each entry: a, stride_a, b, stride_b, c, stride_c, alpha, beta.
    # Cases cover f16/f32, contiguous and column-major strides, batched
    # shapes, and non-trivial alpha/beta.
    test_cases = [
        GemmTestCase(
            random_tensor((4, 5), np.float32), None,
            random_tensor((5, 6), np.float32), None,
            random_tensor((4, 6), np.float32), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float32), gguf_strides(1, 4),
            random_tensor((5, 6), np.float32), gguf_strides(1, 5),
            random_tensor((4, 6), np.float32), gguf_strides(1, 4),
            1.0, 1.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float16), None,
            random_tensor((5, 6), np.float16), None,
            random_tensor((4, 6), np.float16), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float16), gguf_strides(1, 4),
            random_tensor((5, 6), np.float16), gguf_strides(1, 5),
            random_tensor((4, 6), np.float16), gguf_strides(1, 4),
            1.0, 1.0,
        ),
        GemmTestCase(
            random_tensor((1, 2048), np.float16), gguf_strides(1, 2048),
            random_tensor((2048, 2048), np.float16), gguf_strides(1, 2048),
            random_tensor((1, 2048), np.float16), gguf_strides(1, 2048),
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((1, 2048), np.float32), None,
            random_tensor((2048, 2048), np.float32), None,
            random_tensor((1, 2048), np.float32), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((2, 4, 2048), np.float16), None,
            random_tensor((2, 2048, 2048), np.float16), None,
            random_tensor((2, 4, 2048), np.float16), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((2, 4, 2048), np.float32), None,
            random_tensor((2, 2048, 2048), np.float32), None,
            random_tensor((2, 4, 2048), np.float32), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((6, 2048), np.float32), gguf_strides(1, 2048),
            random_tensor((2048, 2560), np.float32), gguf_strides(1, 2560),
            random_tensor((6, 2560), np.float32), gguf_strides(1, 2560),
            1.0, 1.0,
        ),
        GemmTestCase(
            random_tensor((4, 48, 64), np.float16), None,
            random_tensor((4, 64, 6), np.float16), None,
            random_tensor((4, 48, 6), np.float16), None,
            1.0 / 8, 1.0,
        ),
        GemmTestCase(
            random_tensor((4, 48, 64), np.float32), None,
            random_tensor((4, 64, 6), np.float32), None,
            random_tensor((4, 48, 6), np.float32), None,
            1.0 / 8, 1.0,
        ),
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop/gemm.py
View file @
2f2a74b6
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_uint64
,
c_void_p
,
c_float
,
c_bool
import
torch
import
ctypes
import
sys
import
os
import
time
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
,
".."
)))
from
operatorspy
import
(
open_lib
,
to_tensor
,
DeviceEnum
,
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_size_t
,
c_uint64
,
c_void_p
,
c_float
from
libinfiniop
import
(
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
create_handle
,
destroy_handle
,
open_lib
,
to_tensor
,
get_test_devices
,
check_error
,
rearrange_tensor
,
rearrange_if_needed
,
create_workspace
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
)
from
operatorspy.tests.test_utils
import
get_args
import
torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES
=
[
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
]
# Data types used for testing
_TENSOR_DTYPES
=
[
torch
.
float16
,
torch
.
float32
]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
1e-2
},
torch
.
float32
:
{
"atol"
:
0
,
"rtol"
:
1e-3
},
}
DEBUG
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
class
GEMMDescriptor
(
Structure
):
# ==============================================================================
# Definitions
# ==============================================================================
class
GemmDescriptor
(
Structure
):
_fields_
=
[(
"device"
,
c_int32
)]
infiniopG
EMM
Descriptor_t
=
POINTER
(
G
EMM
Descriptor
)
infiniopG
emm
Descriptor_t
=
POINTER
(
G
emm
Descriptor
)
def
gemm
(
A
,
B
,
C
=
None
,
transA
=
False
,
transB
=
False
,
alpha
=
1.0
,
beta
=
0.0
,
dtype
=
torch
.
float32
):
A
=
A
.
T
if
transA
else
A
B
=
B
.
T
if
transB
else
B
result
=
alpha
*
torch
.
matmul
(
A
if
dtype
!=
torch
.
float16
else
A
.
to
(
torch
.
float32
),
B
if
dtype
!=
torch
.
float16
else
B
.
to
(
torch
.
float32
),
).
to
(
dtype
)
if
C
is
not
None
:
result
+=
beta
*
C
if
dtype
!=
torch
.
float16
else
C
.
to
(
torch
.
float32
)
if
PROFILE
:
torch
.
cuda
.
synchronize
()
return
result
# PyTorch implementation for matrix multiplication
def
gemm
(
_c
,
beta
,
_a
,
_b
,
alpha
):
a
,
b
,
c
=
_a
.
clone
(),
_b
.
clone
(),
_c
.
clone
()
result_dtype
=
c
.
dtype
fp32_result
=
torch
.
matmul
(
a
.
to
(
torch
.
float32
),
b
.
to
(
torch
.
float32
))
return
alpha
*
fp32_result
.
to
(
result_dtype
)
+
beta
*
c
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def
test
(
lib
,
handle
,
torch_device
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
=
None
,
b_stride
=
None
,
c_stride
=
None
,
y_stride
=
None
,
dtype
=
torch
.
float16
,
):
print
(
f
"Testing G
EMM
on
{
torch_device
}
with
transA:
{
transA
}
transB:
{
transB
}
"
f
"a_shape:
{
a_shape
}
b_shape:
{
b_shape
}
c_shape:
{
c_shape
}
y_shape:
{
y_shape
}
"
f
"a_stride:
{
a_stride
}
b_stride:
{
b_stride
}
c_stride:
{
c_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
f
"Testing G
emm
on
{
torch_device
}
with
alpha:
{
alpha
}
, beta:
{
beta
}
,
"
f
"
a_shape:
{
a_shape
}
,
b_shape:
{
b_shape
}
,
c_shape:
{
c_shape
}
,
"
f
"
a_stride:
{
a_stride
}
,
b_stride:
{
b_stride
}
,
c_stride:
{
c_stride
}
,
dtype:
{
dtype
}
"
)
# Initialize tensors
a
=
torch
.
rand
(
a_shape
,
dtype
=
dtype
).
to
(
torch_device
)
b
=
torch
.
rand
(
b_shape
,
dtype
=
dtype
).
to
(
torch_device
)
c
=
torch
.
rand
(
c_shape
,
dtype
=
dtype
).
to
(
torch_device
)
if
c_shape
else
None
y
=
torch
.
rand
(
y_shape
,
dtype
=
dtype
).
to
(
torch_device
)
c
=
torch
.
ones
(
c_shape
,
dtype
=
dtype
).
to
(
torch_device
)
if
a_stride
is
not
None
:
a
=
rearrange_tensor
(
a
,
a_stride
)
if
b_stride
is
not
None
:
b
=
rearrange_tensor
(
b
,
b_stride
)
if
c_stride
is
not
None
and
c
is
not
None
:
c
=
rearrange_tensor
(
c
,
c_stride
)
if
y_stride
is
not
None
:
y
=
rearrange_tensor
(
y
,
y_stride
)
# Compute the PyTorch reference result
ans
=
gemm
(
c
,
beta
,
a
,
b
,
alpha
)
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
ans
=
gemm
(
a
,
b
,
c
,
transA
,
transB
,
alpha
,
beta
,
dtype
)
if
PROFILE
:
start_time
=
time
.
time
()
for
i
in
range
(
NUM_ITERATIONS
):
_
=
gemm
(
a
,
b
,
c
,
transA
,
transB
,
alpha
,
beta
,
dtype
)
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
print
(
f
"pytorch time:
{
elapsed
:
6
f
}
"
)
a
,
b
,
c
=
[
rearrange_if_needed
(
tensor
,
stride
)
for
tensor
,
stride
in
zip
([
a
,
b
,
c
],
[
a_stride
,
b_stride
,
c_stride
])
]
a_tensor
,
b_tensor
,
c_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
a
,
b
,
c
]]
a_tensor
=
to_tensor
(
a
,
lib
)
b_tensor
=
to_tensor
(
b
,
lib
)
c_tensor
=
to_tensor
(
c
,
lib
)
if
c
is
not
None
else
None
y_tensor
=
to_tensor
(
y
,
lib
)
descriptor
=
infiniopGEMMDescriptor_t
()
descriptor
=
infiniopGemmDescriptor_t
()
check_error
(
lib
.
infiniopCreateG
EMM
Descriptor
(
lib
.
infiniopCreateG
emm
Descriptor
(
handle
,
ctypes
.
byref
(
descriptor
),
y
_tensor
.
descriptor
,
c
_tensor
.
descriptor
,
a_tensor
.
descriptor
,
b_tensor
.
descriptor
,
c_tensor
.
descriptor
if
c_tensor
else
None
,
alpha
,
beta
,
transA
,
transB
,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
a_tensor
.
descriptor
.
contents
.
invalidate
()
b_tensor
.
descriptor
.
contents
.
invalidate
()
if
c_tensor
is
not
None
:
c_tensor
.
descriptor
.
contents
.
invalidate
()
y_tensor
.
descriptor
.
contents
.
invalidate
()
for
tensor
in
[
a_tensor
,
b_tensor
,
c_tensor
]:
tensor
.
destroyDesc
(
lib
)
workspace_size
=
ctypes
.
c_uint64
(
0
)
# Get workspace size and create workspace
workspace_size
=
c_uint64
(
0
)
check_error
(
lib
.
infiniopGetG
EMM
WorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
))
lib
.
infiniopGetG
emm
WorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
))
)
workspace
=
torch
.
zeros
(
int
(
workspace_size
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
workspace
=
create_workspace
(
workspace_size
.
value
,
a
.
device
)
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
# Execute infiniop gemm operator
def
lib_gemm
():
check_error
(
lib
.
infiniopG
EMM
(
lib
.
infiniopG
emm
(
descriptor
,
workspace
_ptr
,
workspace_size
,
y
_tensor
.
data
,
workspace
.
data_ptr
()
if
workspace
is
not
None
else
None
,
workspace_size
.
value
,
c
_tensor
.
data
,
a_tensor
.
data
,
b_tensor
.
data
,
c_tensor
.
data
if
c_tensor
else
None
,
alpha
,
beta
,
None
,
)
)
if
PROFILE
:
start_time
=
time
.
time
()
for
i
in
range
(
NUM_ITERATIONS
):
check_error
(
lib
.
infiniopGEMM
(
descriptor
,
workspace_ptr
,
workspace_size
,
y_tensor
.
data
,
a_tensor
.
data
,
b_tensor
.
data
,
c_tensor
.
data
if
c_tensor
else
None
,
None
,
)
)
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
print
(
f
" lib time:
{
elapsed
:
6
f
}
"
)
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-2
)
check_error
(
lib
.
infiniopDestroyGEMMDescriptor
(
descriptor
))
lib_gemm
()
def
test_cpu
(
lib
,
test_cases
):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
(
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
# Validate results
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug
(
c
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
c
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
def test_cuda(lib, test_cases):
    """Run every GEMM test case on the CUDA backend, in fp16 and fp32.

    Mirrors test_cpu: one handle per run, each case tested at both dtypes.
    """
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for (
        alpha,
        beta,
        transA,
        transB,
        a_shape,
        b_shape,
        c_shape,
        y_shape,
        a_stride,
        b_stride,
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every GEMM test case on the Cambricon BANG backend, in fp16 and fp32.

    Fix: the diff/merge residue previously interleaved into this function
    (profile_operation calls referencing undefined locals `c`, `a`, `b`,
    `lib_gemm`, and a stray infiniopDestroyGemmDescriptor(descriptor) on an
    undefined `descriptor`) has been removed -- it would raise NameError at
    runtime. Profiling and descriptor teardown belong to test(), which each
    iteration calls.
    """
    # torch_mlu is imported for its side effect of enabling the "mlu"
    # device in torch -- presumably; confirm against the torch_mlu docs.
    import torch_mlu  # noqa: F401

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for (
        alpha,
        beta,
        transA,
        transB,
        a_shape,
        b_shape,
        c_shape,
        y_shape,
        a_stride,
        b_stride,
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
# ==============================================================================
# Main Execution
# ==============================================================================
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    test_cases = [
        # alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
        (1.0, 1.0, False, False, (1, 2048), (2048, 2048), (1, 2048), (1, 2048), None, None, None, None),
        (1.0, 1.0, True, True, (2048, 4), (2048, 2048), (4, 2048), (4, 2048), None, None, None, None),
        # NOTE(review): (1000) and (2048) below are plain ints, not 1-tuples
        # like (2,) -- preserved as-is; confirm this is intentional.
        (1.0, 1.0, False, True, (1, 2048), (1000, 2048), (1000), (1, 1000), None, None, None, None),
        (1.0, 1.0, True, False, (2048, 4), (2048, 2048), (2048), (4, 2048), (4096, 1), (4096, 1), (2,), (4096, 1)),
        (1.0, 1.0, False, False, (3, 1, 2048), (3, 2048, 2048), (1,), (3, 1, 2048), None, None, None, None),
        (1.0, 1.0, True, False, (2048, 4), (2048, 2048), None, (4, 2048), (4096, 1), (4096, 1), (2,), (4096, 1)),
    ]
args
=
get_args
()
lib
=
open_lib
()
lib
.
infiniopCreateG
EMM
Descriptor
.
restype
=
c_int32
lib
.
infiniopCreateG
EMM
Descriptor
.
argtypes
=
[
lib
.
infiniopCreateG
emm
Descriptor
.
restype
=
c_int32
lib
.
infiniopCreateG
emm
Descriptor
.
argtypes
=
[
infiniopHandle_t
,
POINTER
(
infiniopG
EMM
Descriptor_t
),
POINTER
(
infiniopG
emm
Descriptor_t
),
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
c_float
,
c_float
,
c_bool
,
c_bool
,
]
lib
.
infiniopGetG
EMM
WorkspaceSize
.
restype
=
c_int32
lib
.
infiniopGetG
EMM
WorkspaceSize
.
argtypes
=
[
infiniopG
EMM
Descriptor_t
,
POINTER
(
c_
uint64
),
lib
.
infiniopGetG
emm
WorkspaceSize
.
restype
=
c_int32
lib
.
infiniopGetG
emm
WorkspaceSize
.
argtypes
=
[
infiniopG
emm
Descriptor_t
,
POINTER
(
c_
size_t
),
]
lib
.
infiniopG
EMM
.
restype
=
c_int32
lib
.
infiniopG
EMM
.
argtypes
=
[
infiniopG
EMM
Descriptor_t
,
lib
.
infiniopG
emm
.
restype
=
c_int32
lib
.
infiniopG
emm
.
argtypes
=
[
infiniopG
emm
Descriptor_t
,
c_void_p
,
c_uint64
,
c_void_p
,
c_void_p
,
c_void_p
,
c_void_p
,
c_float
,
c_float
,
c_void_p
,
]
lib
.
infiniopDestroyG
EMM
Descriptor
.
restype
=
c_int32
lib
.
infiniopDestroyG
EMM
Descriptor
.
argtypes
=
[
infiniopG
EMM
Descriptor_t
,
lib
.
infiniopDestroyG
emm
Descriptor
.
restype
=
c_int32
lib
.
infiniopDestroyG
emm
Descriptor
.
argtypes
=
[
infiniopG
emm
Descriptor_t
,
]
if
args
.
cpu
:
test_cpu
(
lib
,
test_cases
)
if
args
.
cuda
:
test_cuda
(
lib
,
test_cases
)
if
args
.
bang
:
test_bang
(
lib
,
test_cases
)
if
not
(
args
.
cpu
or
args
.
cuda
or
args
.
bang
):
test_cpu
(
lib
,
test_cases
)
# Configure testing options
DEBUG
=
args
.
debug
PROFILE
=
args
.
profile
NUM_PRERUN
=
args
.
num_prerun
NUM_ITERATIONS
=
args
.
num_iterations
# Execute tests
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
test/infiniop/matmul.py
deleted
100644 → 0
View file @
1d95ddf3
import
torch
import
ctypes
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_size_t
,
c_uint64
,
c_void_p
,
c_float
from
libinfiniop
import
(
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
open_lib
,
to_tensor
,
get_test_devices
,
check_error
,
rearrange_if_needed
,
create_workspace
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES
=
[
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
]
# Data types used for testing
_TENSOR_DTYPES
=
[
torch
.
float16
,
torch
.
float32
]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
1e-2
},
torch
.
float32
:
{
"atol"
:
0
,
"rtol"
:
1e-3
},
}
DEBUG
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
# ==============================================================================
# Definitions
# ==============================================================================
class MatmulDescriptor(Structure):
    # Opaque mirror of the C-side descriptor; only the device id field
    # is declared here so ctypes can size the struct pointer.
    _fields_ = [("device", c_int32)]


# Handle type passed across the C ABI: a pointer to the descriptor struct.
infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)
# PyTorch implementation for matrix multiplication
def matmul(_c, beta, _a, _b, alpha):
    """Reference GEMM: alpha * (_a @ _b) + beta * _c.

    The product is accumulated in fp32 and cast back to _c's dtype
    before blending, so low-precision inputs get a high-precision
    accumulate. All three inputs are cloned, so the caller's tensors
    are never mutated.
    """
    lhs, rhs, acc = _a.clone(), _b.clone(), _c.clone()
    out_dtype = acc.dtype
    product_fp32 = torch.matmul(lhs.to(torch.float32), rhs.to(torch.float32))
    return alpha * product_fp32.to(out_dtype) + beta * acc
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    a_shape,
    b_shape,
    c_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
):
    """Run one matmul case against the infiniop library and check it
    against the PyTorch reference; optionally profiles both paths."""
    print(
        f"Testing Matmul on {torch_device} with alpha:{alpha}, beta:{beta},"
        f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
        f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
    )

    # Initialize tensors on the target device; c starts at ones so the
    # beta term is observable.
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    c = torch.ones(c_shape, dtype=dtype).to(torch_device)

    # Compute the PyTorch reference result before any re-striding.
    ans = matmul(c, beta, a, b, alpha)

    # Apply the requested custom strides (no-op when stride is None).
    a, b, c = [
        rearrange_if_needed(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
    ]
    a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]

    descriptor = infiniopMatmulDescriptor_t()
    check_error(
        lib.infiniopCreateMatmulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them
    # from being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)

    # Get workspace size and create workspace
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, a.device)

    # Execute infiniop matmul operator; c is the in-place output buffer.
    def lib_matmul():
        check_error(
            lib.infiniopMatmul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                alpha,
                beta,
                None,
            )
        )

    lib_matmul()

    # Validate results
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: matmul(c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_matmul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on

    check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    args = get_args()
    lib = open_lib()

    # Declare C ABI signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateMatmulDescriptor.restype = c_int32
    lib.infiniopCreateMatmulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMatmulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetMatmulWorkspaceSize.restype = c_int32
    lib.infiniopGetMatmulWorkspaceSize.argtypes = [
        infiniopMatmulDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopMatmul.restype = c_int32
    lib.infiniopMatmul.argtypes = [
        infiniopMatmulDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyMatmulDescriptor.restype = c_int32
    lib.infiniopDestroyMatmulDescriptor.argtypes = [
        infiniopMatmulDescriptor_t,
    ]

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Execute tests
    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
test/infiniop/random_sample.py
View file @
2f2a74b6
...
...
@@ -82,25 +82,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
globalM
=
dataNp
[
0
]
dataNp
=
(
dataNp
-
globalM
)
/
temperature
dataNp
=
torch
.
softmax
(
dataNp
.
float
(),
dim
=
0
)
sum_s
=
0
for
end
in
range
(
topk
):
sum_s
+=
dataNp
[
end
]
if
sum_s
>=
topp
:
break
if
end
<
topk
-
1
:
end
+=
1
else
:
end
=
topk
sum_s
=
0
for
i
in
range
(
end
):
sum_s
+=
dataNp
[
i
]
random_val
*=
sum_s
sum_s
=
0
for
i
in
range
(
end
):
sum_s
+=
dataNp
[
i
]
if
random_val
<
sum_s
:
for
i
in
range
(
1
,
voc
):
dataNp
[
i
]
+=
dataNp
[
i
-
1
]
limit_k
=
dataNp
[
min
(
topk
,
voc
)
-
1
]
limit_p
=
dataNp
[
voc
-
1
]
*
topp
limit
=
min
(
limit_k
,
limit_p
)
*
random_val
for
i
in
range
(
voc
):
if
limit
<
dataNp
[
i
]:
return
indices
[
i
]
else
:
return
torch
.
argmax
(
data
)
...
...
@@ -129,7 +118,7 @@ def test(
data
,
random_val
,
topp
,
topk
,
voc
,
temperature
)
# 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices
=
torch
.
zeros
([
1
],
dtype
=
torch
.
int64
).
to
(
torch_device
)
indices
=
torch
.
zeros
([],
dtype
=
torch
.
int64
).
to
(
torch_device
)
x_tensor
,
indices_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
data
,
indices
]]
...
...
@@ -147,7 +136,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for
tensor
in
[
x_tensor
,
indices_tensor
]:
tensor
.
des
criptor
.
contents
.
invalidate
(
)
tensor
.
des
troyDesc
(
lib
)
workspace_size
=
c_uint64
(
0
)
check_error
(
...
...
@@ -181,13 +170,13 @@ def test(
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug_all
(
(
indices
[
0
]
.
type
(
ans
.
dtype
),
data
[
indices
[
0
]
]),
(
indices
.
type
(
ans
.
dtype
),
data
[
indices
]),
(
ans
,
data
[
ans
]),
"or"
,
atol
=
atol
,
rtol
=
rtol
,
)
assert
indices
[
0
]
.
type
(
ans
.
dtype
)
==
ans
or
data
[
ans
]
==
data
[
indices
[
0
]
]
assert
indices
.
type
(
ans
.
dtype
)
==
ans
or
data
[
ans
]
==
data
[
indices
]
# Profiling workflow
if
PROFILE
:
...
...
xmake.lua
View file @
2f2a74b6
...
...
@@ -145,7 +145,7 @@ target("infinirt")
set_languages
(
"cxx17"
)
set_installdir
(
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
))
add_files
(
"src/infinirt/*.cc"
)
add_installfiles
(
"include/infinirt.h"
)
add_installfiles
(
"include/infinirt.h"
,
{
prefixdir
=
"include"
}
)
target_end
()
target
(
"infiniop"
)
...
...
@@ -197,8 +197,12 @@ target("infiniop")
add_installfiles
(
"include/infinicore.h"
,
{
prefixdir
=
"include"
})
target_end
()
target
(
"all"
)
set_kind
(
"phony"
)
add_deps
(
"infiniop"
,
"infinirt"
)
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
target_end
()
-- Tests
includes
(
"xmake/test.lua"
)
xmake/test.lua
View file @
2f2a74b6
...
...
@@ -8,4 +8,27 @@ target("infiniutils-test")
add_files
(
os
.
projectdir
()
..
"/src/utils-test/*.cc"
)
-- Test binary for infiniop operators; links against the installed
-- infiniop/infinirt libraries under INFINI_ROOT.
target("infiniop-test")
    set_kind("binary")
    add_deps("infini-utils")
    -- no-op installer: the test binary is never installed
    on_install(function (target) end)
    set_default(false)

    local INFINI_ROOT = os.getenv("INFINI_ROOT")
        or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")

    set_languages("cxx17")
    set_warnings("all", "error")
    add_includedirs(INFINI_ROOT .. "/include")
    add_linkdirs(INFINI_ROOT .. "/lib")
    add_links("infiniop", "infinirt")

    if has_config("omp") then
        add_cxflags("-fopenmp")
        add_ldflags("-fopenmp")
    end

    add_includedirs(os.projectdir() .. "/src/infiniop-test/include")
    add_files(os.projectdir() .. "/src/infiniop-test/src/*.cpp")
    add_files(os.projectdir() .. "/src/infiniop-test/src/ops/*.cpp")
target_end()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment