Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
8b59f4fe
"dockerfile/cuda12.2.dockerfile" did not exist on "62a2913497a866754ae96d57ef445d8cec6e89b2"
Commit
8b59f4fe
authored
May 20, 2025
by
Catheriany
Browse files
Merge remote-tracking branch 'origin/main' into issue/204
parents
16506fc0
df1c6b5d
Changes
65
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
432 additions
and
204 deletions
+432
-204
src/infiniop/reduce/kunlun/reduce_kunlun.h
src/infiniop/reduce/kunlun/reduce_kunlun.h
+3
-1
src/infiniop/tensor.h
src/infiniop/tensor.h
+13
-3
src/infiniop/tensor_descriptor.cc
src/infiniop/tensor_descriptor.cc
+16
-22
src/infinirt/bang/infinirt_bang.cc
src/infinirt/bang/infinirt_bang.cc
+6
-5
src/infinirt/bang/infinirt_bang.h
src/infinirt/bang/infinirt_bang.h
+1
-1
src/infinirt/infinirt.cc
src/infinirt/infinirt.cc
+5
-1
test/infiniop-test/test_generate/testcases/swiglu.py
test/infiniop-test/test_generate/testcases/swiglu.py
+250
-0
test/infiniop/attention.py
test/infiniop/attention.py
+70
-165
test/infiniop/avg_pool.py
test/infiniop/avg_pool.py
+5
-0
test/infiniop/causal_softmax.py
test/infiniop/causal_softmax.py
+8
-1
test/infiniop/conv.py
test/infiniop/conv.py
+5
-1
test/infiniop/expand.py
test/infiniop/expand.py
+5
-1
test/infiniop/gemm.py
test/infiniop/gemm.py
+4
-0
test/infiniop/global_avg_pool.py
test/infiniop/global_avg_pool.py
+5
-1
test/infiniop/libinfiniop/utils.py
test/infiniop/libinfiniop/utils.py
+13
-0
test/infiniop/max_pool.py
test/infiniop/max_pool.py
+5
-1
test/infiniop/mlp.py
test/infiniop/mlp.py
+5
-0
test/infiniop/random_sample.py
test/infiniop/random_sample.py
+4
-0
test/infiniop/rearrange.py
test/infiniop/rearrange.py
+4
-0
test/infiniop/relu.py
test/infiniop/relu.py
+5
-1
No files found.
src/infiniop/reduce/kunlun/reduce_kunlun.h
View file @
8b59f4fe
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_common.h"
#include "../../devices/kunlun/kunlun_
kernel_
common.h"
namespace
op
::
common_kunlun
::
reduce_op
{
using
namespace
device
::
kunlun
::
kernel
;
// Use 16 floats instruction to calculate reduce
// data_ptr is the pointer of LM
static
inline
__device__
float
sumSquaredF32
(
float
*
data_ptr
,
int
count
)
{
...
...
src/infiniop/tensor.h
View file @
8b59f4fe
...
...
@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string>
#include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
struct
InfiniopTensorDescriptor
{
private:
// Datatype
...
...
@@ -32,9 +42,9 @@ public:
bool
hasBroadcastDim
()
const
;
std
::
vector
<
size_t
>
getBroadcastDim
()
const
;
infiniopTensorDescriptor_t
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
;
infiniopTensorDescriptor_t
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
;
infiniopTensorDescriptor_t
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
;
utils
::
Result
<
infiniopTensorDescriptor_t
>
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
;
utils
::
Result
<
infiniopTensorDescriptor_t
>
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
;
utils
::
Result
<
infiniopTensorDescriptor_t
>
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
;
std
::
string
toString
()
const
;
};
...
...
src/infiniop/tensor_descriptor.cc
View file @
8b59f4fe
...
...
@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std
::
vector
<
ptrdiff_t
>
strides
(
ndim
);
ptrdiff_t
dsize
=
1
;
if
(
ndim
>
0
)
{
for
(
size_
t
i
=
ndim
-
1
;
i
>=
0
;
i
--
)
{
for
(
in
t
i
=
(
int
)
ndim
-
1
;
i
>=
0
;
i
--
)
{
strides
[
i
]
=
dsize
;
dsize
*=
shape_
[
i
];
}
...
...
@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return
res
;
}
infiniopTensorDescriptor_t
InfiniopTensorDescriptor
::
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
{
if
(
dim_start
>
dim_end
||
dim_end
>=
ndim
())
{
return
nullptr
;
}
utils
::
Result
<
infiniopTensorDescriptor_t
>
InfiniopTensorDescriptor
::
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
{
CHECK_OR_RETURN
(
dim_start
<=
dim_end
&&
dim_end
<
ndim
(),
INFINI_STATUS_BAD_PARAM
);
size_t
new_ndim
=
ndim
()
-
(
dim_end
-
dim_start
);
std
::
vector
<
size_t
>
new_shape
(
new_ndim
);
...
...
@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index
++
;
}
if
(
!
isContiguous
(
dim_start
,
dim_end
))
{
return
nullptr
;
}
CHECK_OR_RETURN
(
isContiguous
(
dim_start
,
dim_end
),
INFINI_STATUS_BAD_PARAM
);
new_shape
[
index
]
=
1
;
for
(
size_t
i
=
dim_start
;
i
<=
dim_end
;
i
++
)
{
...
...
@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index
++
;
}
return
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
());
return
utils
::
Result
<
infiniopTensorDescriptor_t
>
(
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
()));
}
infiniopTensorDescriptor_t
InfiniopTensorDescriptor
::
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
{
utils
::
Result
<
infiniopTensorDescriptor_t
>
InfiniopTensorDescriptor
::
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
{
size_t
ndim_
=
ndim
();
if
(
dim
(
axis
)
!=
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
()))
{
return
nullptr
;
}
CHECK_OR_RETURN
(
dim
(
axis
)
==
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
()),
INFINI_STATUS_BAD_PARAM
);
size_t
new_ndim
=
ndim_
+
dims
.
size
()
-
1
;
std
::
vector
<
size_t
>
new_shape
(
new_ndim
);
...
...
@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index
++
;
}
return
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
());
return
utils
::
Result
<
infiniopTensorDescriptor_t
>
(
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
()));
}
infiniopTensorDescriptor_t
InfiniopTensorDescriptor
::
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
{
utils
::
Result
<
infiniopTensorDescriptor_t
>
InfiniopTensorDescriptor
::
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
{
auto
ndim_
=
ndim
();
if
(
order
.
size
()
!=
ndim_
)
{
return
nullptr
;
}
CHECK_OR_RETURN
(
order
.
size
()
==
ndim_
,
INFINI_STATUS_BAD_PARAM
);
std
::
vector
<
size_t
>
new_shape
(
ndim_
);
std
::
vector
<
ptrdiff_t
>
new_strides
(
ndim_
);
for
(
size_t
i
=
0
;
i
<
ndim_
;
i
++
)
{
if
(
std
::
find
(
order
.
begin
(),
order
.
end
(),
i
)
==
order
.
end
())
{
return
nullptr
;
}
CHECK_OR_RETURN
(
std
::
find
(
order
.
begin
(),
order
.
end
(),
i
)
!=
order
.
end
(),
INFINI_STATUS_BAD_PARAM
);
new_shape
[
i
]
=
dim
(
order
[
i
]);
new_strides
[
i
]
=
stride
(
order
[
i
]);
}
return
new
InfiniopTensorDescriptor
(
_dtype
,
ndim_
,
new_shape
.
data
(),
new_strides
.
data
());
return
utils
::
Result
<
infiniopTensorDescriptor_t
>
(
new
InfiniopTensorDescriptor
(
_dtype
,
ndim_
,
new_shape
.
data
(),
new_strides
.
data
()));
}
std
::
string
InfiniopTensorDescriptor
::
toString
()
const
{
...
...
src/infinirt/bang/infinirt_bang.cc
View file @
8b59f4fe
...
...
@@ -6,7 +6,8 @@
namespace
infinirt
::
bang
{
infiniStatus_t
getDeviceCount
(
int
*
count
)
{
CHECK_BANGRT
(
cnrtGetDeviceCount
(
count
));
unsigned
int
device_count
=
static_cast
<
unsigned
int
>
(
*
count
);
CHECK_BANGRT
(
cnrtGetDeviceCount
(
&
device_count
));
return
INFINI_STATUS_SUCCESS
;
}
...
...
@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t
streamCreate
(
infinirtStream_t
*
stream_ptr
)
{
cnrtQueue_t
queue
;
CHECK_BANGRT
(
cnrtQueueCreate
(
&
stream
));
CHECK_BANGRT
(
cnrtQueueCreate
(
&
queue
));
*
stream_ptr
=
queue
;
return
INFINI_STATUS_SUCCESS
;
}
...
...
@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
}
infiniStatus_t
eventQuery
(
infinirtEvent_t
event
,
infinirtEventStatus_t
*
status_ptr
)
{
auto
status
=
cnrtQueryNotifier
((
cnrt
Queue_t
)
stream
);
auto
status
=
cnrtQueryNotifier
((
cnrt
Notifier_t
)
event
);
if
(
status
==
cnrtSuccess
)
{
*
status_ptr
=
INFINIRT_EVENT_COMPLETE
;
}
else
if
(
status
==
cnrtErrorBusy
)
{
...
...
@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
}
infiniStatus_t
memcpy
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
infinirtMemcpyKind_t
kind
)
{
CHECK_BANGRT
(
cnrtMemcpy
(
dst
,
src
,
size
,
toBangMemcpyKind
(
kind
)));
CHECK_BANGRT
(
cnrtMemcpy
(
dst
,
(
void
*
)
src
,
size
,
toBangMemcpyKind
(
kind
)));
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
memcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
infinirtMemcpyKind_t
kind
,
infinirtStream_t
stream
)
{
CHECK_BANGRT
(
cnrtMemcpyAsync_V2
(
dst
,
src
,
size
,
(
cnrtQueue_t
)
stream
,
toBangMemcpyKind
(
kind
)));
CHECK_BANGRT
(
cnrtMemcpyAsync_V2
(
dst
,
(
void
*
)
src
,
size
,
(
cnrtQueue_t
)
stream
,
toBangMemcpyKind
(
kind
)));
return
INFINI_STATUS_SUCCESS
;
}
...
...
src/infinirt/bang/infinirt_bang.h
View file @
8b59f4fe
...
...
@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
namespace
infinirt
::
bang
{
#ifdef ENABLE_
BANG
_API
#ifdef ENABLE_
CAMBRICON
_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
...
...
src/infinirt/infinirt.cc
View file @
8b59f4fe
...
...
@@ -4,6 +4,7 @@
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h"
...
...
@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
\
_status =
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
...
...
test/infiniop-test/test_generate/testcases/swiglu.py
0 → 100644
View file @
8b59f4fe
import
numpy
as
np
import
gguf
from
typing
import
List
from
..
import
InfiniopTestWriter
,
InfiniopTestCase
,
np_dtype_to_ggml
,
gguf_strides
def swiglu(
    a: np.ndarray,
    b: np.ndarray,
):
    """Reference SwiGLU used to produce the expected test output.

    Computes ``a * b / (1 + exp(-b))`` element-wise with NumPy, i.e. ``a``
    multiplied by ``b`` gated through a logistic sigmoid of ``b``.

    Args:
        a: First operand array.
        b: Second operand array; also the argument of the sigmoid gate.

    Returns:
        The element-wise result as a NumPy array.
    """
    # Keep the same evaluation order as the single-expression form
    # ((a * b) / (1.0 + exp(-b))) so results are bit-identical.
    numerator = a * b
    denominator = 1.0 + np.exp(-b)
    return numerator / denominator
class SwiGLUTestCase(InfiniopTestCase):
    """One SwiGLU test case: operands ``a``/``b``, output buffer ``c`` and their optional strides."""

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        """Store the three tensors and their strides; ``None`` means no strides entry is written."""
        super().__init__("swiglu")
        self.a, self.stride_a = a, stride_a
        self.b, self.stride_b = b, stride_b
        self.c, self.stride_c = c, stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Write strides, tensors and the float64 reference answer through ``test_writer``."""
        super().write_test(test_writer)

        # (tensor key, strides key, tensor, strides) for each operand, in write order.
        operands = (
            ("a", "a.strides", self.a, self.stride_a),
            ("b", "b.strides", self.b, self.stride_b),
            ("c", "c.strides", self.c, self.stride_c),
        )

        # Strides arrays come first and are only emitted when provided.
        for _, strides_key, _, strides in operands:
            if strides is not None:
                test_writer.add_array(test_writer.gguf_key(strides_key), strides)

        # Then the raw tensors, each tagged with the GGML type matching its dtype.
        for tensor_key, _, tensor, _ in operands:
            test_writer.add_tensor(
                test_writer.gguf_key(tensor_key),
                tensor,
                raw_dtype=np_dtype_to_ggml(tensor.dtype),
            )

        # The expected answer is computed in float64 regardless of the input dtype.
        reference = swiglu(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            reference,
            raw_dtype=gguf.GGMLQuantizationType.F64,
        )
if __name__ == "__main__":
    # Script entry point: build every SwiGLU case and save them to "swiglu.gguf".
    test_writer = InfiniopTestWriter("swiglu.gguf")

    def _make_operand(shape, dtype, stride_args):
        """Return (random tensor of `shape`/`dtype`, strides or None)."""
        data = np.random.rand(*shape).astype(dtype)
        strides = None if stride_args is None else gguf_strides(*stride_args)
        return data, strides

    f32, f16 = np.float32, np.float16

    # Each entry: (shape, dtype, a strides, b strides, c strides).
    # A strides value of None means the case carries no strides for that tensor;
    # otherwise the tuple holds the arguments passed to gguf_strides().
    case_specs = [
        ((64, 128), f32, None, None, None),
        ((64, 121), f32, None, None, None),
        ((15, 512), f32, None, None, None),
        ((13, 4), f32, None, None, None),
        ((13, 4), f16, None, None, None),
        ((13, 4), f32, (10, 1), (10, 1), (10, 1)),
        ((13, 4), f16, (10, 1), (10, 1), (10, 1)),
        ((13, 4, 4), f32, None, None, None),
        ((13, 4, 4), f16, None, None, None),
        ((13, 4, 4), f32, (20, 4, 1), (20, 4, 1), (20, 4, 1)),
        ((13, 4, 4), f16, (20, 4, 1), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), f32, None, None, None),
        ((16, 5632), f16, None, None, None),
        ((16, 5632), f32, (13312, 1), (13312, 1), (13312, 1)),
        ((16, 5632), f16, (13312, 1), (13312, 1), (13312, 1)),
        ((16, 5632), f32, (5632, 1), (5632, 1), (1, 16)),
        ((16, 5632), f16, (5632, 1), (5632, 1), (1, 16)),
        ((2, 3, 400), f32, (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
        ((2, 3, 400), f16, (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
        ((4, 4, 5632), f32, None, None, None),
        ((4, 4, 5632), f16, None, None, None),
        ((4, 4, 5632), f32, (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
        ((4, 4, 5632), f16, (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
    ]

    test_cases = []
    for shape, dtype, a_strides, b_strides, c_strides in case_specs:
        # Operands are generated in a, b, c order so the sequence of
        # np.random.rand draws matches the hand-expanded case list.
        a, stride_a = _make_operand(shape, dtype, a_strides)
        b, stride_b = _make_operand(shape, dtype, b_strides)
        c, stride_c = _make_operand(shape, dtype, c_strides)
        test_cases.append(SwiGLUTestCase(a, stride_a, b, stride_b, c, stride_c))

    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop/attention.py
View file @
8b59f4fe
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_uint64
,
c_void_p
,
c_float
,
c_bool
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_uint64
,
c_void_p
import
ctypes
import
sys
import
os
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
,
".."
)))
from
operatorspy
import
(
from
libinfiniop
import
(
open_lib
,
to_tensor
,
CTensor
,
DeviceEnum
,
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
create_handle
,
destroy_handle
,
check_error
,
rearrange_tensor
,
create_workspace
,
get_args
,
get_test_devices
,
test_operator
,
debug
,
get_tolerance
,
profile_operation
,
)
from
operatorspy.tests.test_utils
import
get_args
import
torch
import
torch.nn.functional
as
F
class
AttentionDescriptor
(
Structure
):
...
...
@@ -95,12 +95,13 @@ def test(
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
=
torch
.
float16
,
q_stride
=
None
,
k_stride
=
None
,
v_stride
=
None
,
k_cache_stride
=
None
,
v_cache_stride
=
None
,
dtype
=
torch
.
float16
,
sync
=
None
,
):
print
(
f
"Testing Attention on
{
torch_device
}
with n_q_head:
{
n_q_head
}
n_kv_head:
{
n_kv_head
}
seq_len:
{
seq_len
}
head_dim:
{
head_dim
}
pos:
{
pos
}
"
...
...
@@ -140,6 +141,9 @@ def test(
k_cache_tensor
=
to_tensor
(
k_cache
,
lib
)
v_cache_tensor
=
to_tensor
(
v_cache
,
lib
)
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopAttentionDescriptor_t
()
check_error
(
lib
.
infiniopCreateAttentionDescriptor
(
...
...
@@ -156,12 +160,15 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor
.
descriptor
.
contents
.
invalidate
()
q_tensor
.
descriptor
.
contents
.
invalidate
()
k_tensor
.
descriptor
.
contents
.
invalidate
()
v_tensor
.
descriptor
.
contents
.
invalidate
()
k_cache_tensor
.
descriptor
.
contents
.
invalidate
()
v_cache_tensor
.
descriptor
.
contents
.
invalidate
()
for
tensor
in
[
out_tensor
,
q_tensor
,
k_tensor
,
v_tensor
,
k_cache_tensor
,
v_cache_tensor
,
]:
tensor
.
destroyDesc
(
lib
)
workspace_size
=
c_uint64
(
0
)
check_error
(
...
...
@@ -169,152 +176,52 @@ def test(
)
workspace
=
create_workspace
(
workspace_size
.
value
,
out
.
device
)
check_error
(
lib
.
infiniopAttention
(
descriptor
,
workspace
.
data_ptr
()
if
workspace
is
not
None
else
None
,
workspace_size
.
value
,
out_tensor
.
data
,
q_tensor
.
data
,
k_tensor
.
data
,
v_tensor
.
data
,
k_cache_tensor
.
data
,
v_cache_tensor
.
data
,
None
,
def
lib_attention
():
check_error
(
lib
.
infiniopAttention
(
descriptor
,
workspace
.
data_ptr
()
if
workspace
is
not
None
else
None
,
workspace_size
.
value
,
out_tensor
.
data
,
q_tensor
.
data
,
k_tensor
.
data
,
v_tensor
.
data
,
k_cache_tensor
.
data
,
v_cache_tensor
.
data
,
None
,
)
)
)
assert
torch
.
allclose
(
out
,
ans
,
atol
=
1e-4
,
rtol
=
1e-2
)
lib_attention
(
)
check_error
(
lib
.
infiniopDestroyAttentionDescriptor
(
descriptor
))
def
test_cpu
(
lib
,
test_cases
):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
(
n_q_head
,
n_kv_head
,
seq_len
,
head_dim
,
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
,
q_stride
,
k_stride
,
v_stride
,
k_cache_stride
,
v_cache_stride
,
)
in
test_cases
:
test
(
lib
,
handle
,
"cpu"
,
n_q_head
,
n_kv_head
,
seq_len
,
head_dim
,
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
,
q_stride
,
k_stride
,
v_stride
,
k_cache_stride
,
v_cache_stride
,
)
# Validate results
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug
(
out
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
out
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
destroy_handle
(
lib
,
handle
)
def
test_cuda
(
lib
,
test_cases
):
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
for
(
n_q_head
,
n_kv_head
,
seq_len
,
head_dim
,
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
,
q_stride
,
k_stride
,
v_stride
,
k_cache_stride
,
v_cache_stride
,
)
in
test_cases
:
test
(
lib
,
handle
,
"cuda"
,
n_q_head
,
n_kv_head
,
seq_len
,
head_dim
,
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
,
q_stride
,
k_stride
,
v_stride
,
k_cache_stride
,
v_cache_stride
,
)
destroy_handle
(
lib
,
handle
)
def
test_bang
(
lib
,
test_cases
):
import
torch_mlu
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
for
(
n_q_head
,
n_kv_head
,
seq_len
,
head_dim
,
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
,
q_stride
,
k_stride
,
v_stride
,
k_cache_stride
,
v_cache_stride
,
)
in
test_cases
:
test
(
lib
,
handle
,
"mlu"
,
n_q_head
,
n_kv_head
,
seq_len
,
head_dim
,
pos
,
k_cache_buf_len
,
v_cache_buf_len
,
dtype
,
q_stride
,
k_stride
,
v_stride
,
k_cache_stride
,
v_cache_stride
,
)
destroy_handle
(
lib
,
handle
)
# Profiling workflow
if
PROFILE
:
# fmt: off
profile_operation
(
"PyTorch"
,
lambda
:
attention
(
q
,
k
,
v
,
k_cache
,
v_cache
,
pos
),
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_attention
(),
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
# fmt: on
check_error
(
lib
.
infiniopDestroyAttentionDescriptor
(
descriptor
))
if
__name__
==
"__main__"
:
_TENSOR_DTYPES
=
[
torch
.
float16
,
torch
.
float32
]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
1e-4
,
"rtol"
:
1e-2
},
torch
.
float32
:
{
"atol"
:
1e-6
,
"rtol"
:
1e-4
},
}
DEBUG
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
test_cases
=
[
# prefill
(
...
...
@@ -325,7 +232,6 @@ if __name__ == "__main__":
0
,
# pos
2048
,
# k_cache_buf_len
2048
,
# v_cache_buf_len
torch
.
float16
,
# dtype
[
64
,
2560
,
1
],
# q_stride
[
64
,
2560
,
1
],
# k_stride
[
64
,
2560
,
1
],
# v_stride
...
...
@@ -341,7 +247,6 @@ if __name__ == "__main__":
3
,
# pos
2048
,
# k_cache_buf_len
2048
,
# v_cache_buf_len
torch
.
float16
,
# dtype
[
64
,
2560
,
1
],
# q_stride
[
64
,
2560
,
1
],
# k_stride
[
64
,
2560
,
1
],
# v_stride
...
...
@@ -357,7 +262,6 @@ if __name__ == "__main__":
1
,
# pos
8
,
# k_cache_buf_len
8
,
# v_cache_buf_len
torch
.
float16
,
# dtype
None
,
# q_stride
None
,
# k_stride
None
,
# v_stride
...
...
@@ -406,12 +310,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t
,
]
if
args
.
cpu
:
test_cpu
(
lib
,
test_cases
)
if
args
.
cuda
:
test_cuda
(
lib
,
test_cases
)
if
args
.
bang
:
test_bang
(
lib
,
test_cases
)
if
not
(
args
.
cpu
or
args
.
cuda
or
args
.
bang
):
test_cpu
(
lib
,
test_cases
)
# Configure testing options
DEBUG
=
args
.
debug
PROFILE
=
args
.
profile
NUM_PRERUN
=
args
.
num_prerun
NUM_ITERATIONS
=
args
.
num_iterations
# Execute tests
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
test_cases
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
test/infiniop/avg_pool.py
View file @
8b59f4fe
...
...
@@ -88,6 +88,7 @@ def test(
padding
,
strides
,
tensor_dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing AvgPool on
{
torch_device
}
with x_shape:
{
x_shape
}
kernel_shape:
{
k_shape
}
padding:
{
padding
}
strides:
{
strides
}
dtype:
{
tensor_dtype
}
"
...
...
@@ -109,6 +110,10 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
y_tensor
=
to_tensor
(
y
,
lib
)
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopAvgPoolDescriptor_t
()
check_error
(
...
...
test/infiniop/causal_softmax.py
View file @
8b59f4fe
...
...
@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
1e-2
},
torch
.
float16
:
{
"atol"
:
1e-3
,
"rtol"
:
1e-2
},
}
...
...
@@ -87,6 +87,7 @@ def test(
y_stride
=
None
,
inplace
=
Inplace
.
OUT_OF_PLACE
,
dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing CausalSoftmax on
{
torch_device
}
with shape:
{
shape
}
x_stride:
{
x_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
inplace:
{
inplace
}
"
...
...
@@ -107,6 +108,9 @@ def test(
y
=
torch
.
zeros
(
shape
,
dtype
=
dtype
).
to
(
torch_device
)
y
=
rearrange_if_needed
(
y
,
y_stride
)
y_tensor
=
to_tensor
(
y
,
lib
)
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopCausalSoftmaxDescriptor_t
()
check_error
(
...
...
@@ -139,6 +143,9 @@ def test(
)
lib_causal_softmax
()
if
sync
is
not
None
:
sync
()
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
...
...
test/infiniop/conv.py
View file @
8b59f4fe
...
...
@@ -95,6 +95,7 @@ def test(
dilations
,
tensor_stride
=
None
,
tensor_dtype
=
torch
.
float16
,
sync
=
None
):
assert
len
(
pads
)
==
len
(
strides
)
==
len
(
dilations
)
print
(
...
...
@@ -118,8 +119,11 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
w_tensor
=
to_tensor
(
w
,
lib
)
y_tensor
=
to_tensor
(
y
,
lib
)
descriptor
=
infiniopConvDescriptor_t
()
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopConvDescriptor_t
()
check_error
(
lib
.
infiniopCreateConvDescriptor
(
handle
,
...
...
test/infiniop/expand.py
View file @
8b59f4fe
...
...
@@ -52,6 +52,7 @@ def test(
y_stride
=
None
,
x_stride
=
None
,
tensor_dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing Expand on
{
torch_device
}
with x_shape:
{
x_shape
}
y_shape:
{
y_shape
}
x_stride:
{
x_stride
}
y_stride:
{
y_stride
}
dtype:
{
tensor_dtype
}
"
...
...
@@ -76,8 +77,11 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
y_tensor
=
to_tensor
(
y
,
lib
)
descriptor
=
infiniopExpandDescriptor_t
()
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopExpandDescriptor_t
()
check_error
(
lib
.
infiniopCreateExpandDescriptor
(
handle
,
...
...
test/infiniop/gemm.py
View file @
8b59f4fe
...
...
@@ -83,6 +83,7 @@ def test(
b_stride
=
None
,
c_stride
=
None
,
dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing Gemm on
{
torch_device
}
with alpha:
{
alpha
}
, beta:
{
beta
}
,"
...
...
@@ -104,6 +105,9 @@ def test(
]
a_tensor
,
b_tensor
,
c_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
a
,
b
,
c
]]
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopGemmDescriptor_t
()
check_error
(
lib
.
infiniopCreateGemmDescriptor
(
...
...
test/infiniop/global_avg_pool.py
View file @
8b59f4fe
...
...
@@ -51,6 +51,7 @@ def test(
torch_device
,
x_shape
,
tensor_dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing GlobalAvgPool on
{
torch_device
}
with input tensor_shape:
{
x_shape
}
dtype:
{
tensor_dtype
}
"
...
...
@@ -70,8 +71,11 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
y_tensor
=
to_tensor
(
y
,
lib
)
descriptor
=
infiniopGlobalAvgPoolDescriptor_t
()
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopGlobalAvgPoolDescriptor_t
()
check_error
(
lib
.
infiniopCreateGlobalAvgPoolDescriptor
(
handle
,
...
...
test/infiniop/libinfiniop/utils.py
View file @
8b59f4fe
...
...
@@ -423,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
,
get_sync_func
(
device
)
)
finally
:
destroy_handle
(
lib
,
handle
)
...
...
@@ -471,3 +472,15 @@ def get_test_devices(args):
devices_to_test
=
[
InfiniDeviceEnum
.
CPU
]
return
devices_to_test
def get_sync_func(device):
    """Return the torch synchronize callable for `device`, or None for CPU.

    The device is first translated through `infiniDeviceEnum_str_map`; for any
    non-CPU device the result is the `synchronize` attribute of the torch
    attribute with that name (presumably a backend module such as
    `torch.cuda` - confirm against the map's values).
    """
    import torch

    # Look the name up before the CPU check so an unmapped device still
    # fails here, exactly as in the assign-then-return form.
    backend_name = infiniDeviceEnum_str_map[device]
    if device == InfiniDeviceEnum.CPU:
        return None
    return getattr(torch, backend_name).synchronize
test/infiniop/max_pool.py
View file @
8b59f4fe
...
...
@@ -83,6 +83,7 @@ def test(
padding
,
strides
,
tensor_dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing MaxPool on
{
torch_device
}
with x_shape:
{
x_shape
}
kernel_shape:
{
k_shape
}
padding:
{
padding
}
strides:
{
strides
}
dtype:
{
tensor_dtype
}
"
...
...
@@ -104,8 +105,11 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
y_tensor
=
to_tensor
(
y
,
lib
)
descriptor
=
infiniopMaxPoolDescriptor_t
()
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopMaxPoolDescriptor_t
()
check_error
(
lib
.
infiniopCreateMaxPoolDescriptor
(
handle
,
...
...
test/infiniop/mlp.py
View file @
8b59f4fe
...
...
@@ -65,6 +65,7 @@ def test(
y_stride
=
None
,
w12_stride
=
None
,
w3_stride
=
None
,
sync
=
None
):
print
(
f
"Testing MLP on
{
torch_device
}
with num_tokens:
{
num_tokens
}
hidden_size:
{
hidden_size
}
intermediate_size:
{
intermediate_size
}
"
...
...
@@ -97,6 +98,10 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
w12_tensor
=
to_tensor
(
w12
,
lib
)
w3_tensor
=
to_tensor
(
w3
,
lib
)
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopMLPDescriptor_t
()
check_error
(
lib
.
infiniopCreateMLPDescriptor
(
...
...
test/infiniop/random_sample.py
View file @
8b59f4fe
...
...
@@ -103,6 +103,7 @@ def test(
topk
,
temperature
,
dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing RandomSample on
{
torch_device
}
with voc:
{
voc
}
random_val:
{
random_val
}
topp:
{
topp
}
topk:
{
topk
}
temperature:
{
temperature
}
dtype:
{
dtype
}
"
...
...
@@ -122,6 +123,9 @@ def test(
indices_tensor
.
descriptor
.
contents
.
dt
=
InfiniDtype
.
U64
# treat int64 as uint64
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopRandomSampleDescriptor_t
()
check_error
(
lib
.
infiniopCreateRandomSampleDescriptor
(
...
...
test/infiniop/rearrange.py
View file @
8b59f4fe
...
...
@@ -131,6 +131,7 @@ def test(
x_stride
,
y_stride
,
dtype
=
torch
.
float16
,
sync
=
None
):
print
(
f
"Testing Rerrange on
{
torch_device
}
with shape:
{
shape
}
x_stride:
{
x_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
...
...
@@ -145,6 +146,9 @@ def test(
]
x_tensor
,
y_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
x
,
y
]]
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopRearrangeDescriptor_t
()
check_error
(
...
...
test/infiniop/relu.py
View file @
8b59f4fe
...
...
@@ -55,6 +55,7 @@ def test(
tensor_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
Inplace
.
OUT_OF_PLACE
,
sync
=
None
):
print
(
f
"Testing Relu on
{
torch_device
}
with tensor_shape:
{
tensor_shape
}
dtype:
{
tensor_dtype
}
inplace:
{
inplace
.
name
}
"
...
...
@@ -78,8 +79,11 @@ def test(
x_tensor
=
to_tensor
(
x
,
lib
)
y_tensor
=
to_tensor
(
y
,
lib
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
x_tensor
descriptor
=
infiniopReluDescriptor_t
()
if
sync
is
not
None
:
sync
()
descriptor
=
infiniopReluDescriptor_t
()
check_error
(
lib
.
infiniopCreateReluDescriptor
(
handle
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment