OpenDAS / ktransformers — Commits

Commit 216a63b8 (unverified), authored Mar 03, 2025 by Azure, committed via GitHub Mar 03, 2025
Parents: 798e1d0c d24d3693

Merge pull request #754 from moonshadow-25/dev023

Support for IQ1_S (Dynamic 1.58-bit)
3 changed files with 1870 additions and 89 deletions:

    third_party/llamafile/iqk_mul_mat.inc         +1854  -75
    third_party/llamafile/sgemm.h                    +8   -3
    third_party/llamafile/tinyblas_cpu_sgemm.inc     +8  -11
third_party/llamafile/iqk_mul_mat.inc (view file @ 216a63b8)
```diff
@@ -69,6 +69,10 @@
 #endif
+constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
+constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);
 namespace {
 typedef struct {
```
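The enum values 98 and 99 sit outside the range ggml itself defines, so these two constants give llamafile-internal names to repacked Q8 activation formats. A hedged sketch of what an ×4-interleaved Q8_0 block plausibly looks like: `block_q8_0` below is ggml's standard layout, while `block_q8_0_x4` is my assumption about the repacked form (the real definition lives elsewhere in iqk_mul_mat.inc).

```cpp
#include <cstdint>

typedef uint16_t ggml_half;     // stand-in for ggml's fp16 scale type

struct block_q8_0 {             // standard ggml Q8_0 block
    ggml_half d;                // one fp16 scale
    int8_t    qs[32];           // 32 8-bit quants
};

struct block_q8_0_x4 {          // assumed layout of the X4 variant:
    ggml_half d[4];             // four consecutive blocks' scales grouped,
    int8_t    qs[4 * 32];       // then their quants, so SIMD kernels can load
};                              // 4 scales and 128 quants contiguously
```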
```diff
@@ -106,13 +110,36 @@ struct DataInfo {
     }
 };
 
+/*
+moonll
+change param for set_mul_mat
+add func16
+*/
 typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);
 
 struct MulMat {
     std::array<mul_mat_t, 8> funcs = {};
+    mul_mat_t func16 = nullptr;
     //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
     IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
         constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)
+        if (func16 && nrc_y >= 16) {
+            int n_step = (nrc_y - info.cur_y)/16;
+            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+                auto this_info = info;
+                this_info.s += ix;
+                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
+                for (int iy = 0; iy < n_step; ++iy) {
+                    func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
+                    this_info.cur_y += 16;
+                }
+            }
+            info.cur_y += 16*n_step;
+            if (info.cur_y == nrc_y) return;
+        }
         int n_step = (nrc_y - info.cur_y)/funcs.size();
         if (n_step > 0) {
             for (int ix = 0; ix < nrc_x; ix += k_x_step) {
@@ -131,7 +158,7 @@ struct MulMat {
                 funcs[n_left-1](n, vx, bx, info, nrc_x);
             }
         }
     }
-    static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int Ny);
+    static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
 private:
     template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
 };
```
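The new `func16` member gives `mul_mat_NxM` a fast path: while at least 16 activation rows remain, each column tile of width `k_x_step` is processed 16 rows at a time, and only the leftover rows fall through to the generic 1- to 8-row kernels in `funcs`. A minimal standalone sketch of that dispatch order follows; the toy types and names (`MulMatSketch`, `kernel_t`) are mine, not the real signatures, and the tile/offset bookkeeping is stripped out to show just the row grouping.

```cpp
#include <array>
#include <cstdio>

using kernel_t = void (*)(int rows);          // stand-in for mul_mat_t

struct MulMatSketch {
    std::array<kernel_t, 8> funcs = {};       // funcs[i] handles i+1 rows
    kernel_t func16 = nullptr;                // optional 16-row kernel

    void dispatch(int nrc_y) {                // nrc_y = activation rows
        int cur_y = 0;
        if (func16 && nrc_y >= 16) {
            int n_step = (nrc_y - cur_y) / 16;      // full groups of 16 rows
            for (int i = 0; i < n_step; ++i) func16(16);
            cur_y += 16 * n_step;
            if (cur_y == nrc_y) return;
        }
        int n_step = (nrc_y - cur_y) / (int)funcs.size();
        for (int i = 0; i < n_step; ++i) { funcs[7](8); cur_y += 8; }
        if (int n_left = nrc_y - cur_y)             // remainder, 1..7 rows
            funcs[n_left - 1](n_left);
    }
};

int main() {
    MulMatSketch mm;
    mm.func16 = [](int r) { std::printf("func16 kernel: %d rows\n", r); };
    for (auto& f : mm.funcs) f = [](int r) { std::printf("generic kernel: %d rows\n", r); };
    mm.dispatch(37);   // 2x16 rows via func16, then 5 rows via funcs[4]
}
```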
```diff
@@ -147,6 +174,787 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
     aux32[0] = a0 & 0x3f3f3f3f;
 }
```
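As an aside on the context line above: k-quant formats such as Q4_K store their sub-block scales as 6-bit values, and `& 0x3f3f3f3f` masks four packed bytes down to their low 6 bits in a single 32-bit AND. A small standalone illustration (the example value is mine):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t a0     = 0xC1F2833Fu;        // example: four packed scale bytes
    uint32_t scales = a0 & 0x3f3f3f3f;    // keep low 6 bits of each byte
    std::printf("%08x\n", scales);        // prints 0132033f
    for (int i = 0; i < 4; ++i)           // scales 63, 3, 50, 1
        std::printf("scale %d = %u\n", i, (scales >> (8 * i)) & 0x3f);
}
```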
The remaining additions in this hunk are the IQ1_S decoding tables (condensed here; the full commit carries all 2048 entries of each table):

```cpp
/*
moonll
decoding tables
*/
#ifdef __AVX2__
static const uint64_t iq1s_grid_us[2048] = {
    0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
    0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
    0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
    // ... 2048 entries in total; every byte of every entry is 0x00, 0x01, or 0x02 ...
    0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
};
```
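By inspection, every byte of every table entry is 0x00, 0x01, or 0x02, so each 64-bit entry appears to pack one codebook point of 8 ternary weights, one per byte. A hedged sketch of the decode, assuming (the diff does not state this) that the byte values {0,1,2} stand for the weights {-1,0,+1}:

```cpp
#include <cstdint>
#include <cstdio>

// Assumed decode of one AVX2-path grid entry: byte i holds weight i,
// shifted up by 1 so it fits in an unsigned table ("_us").
static void decode_grid_entry_u64(uint64_t entry, int8_t out[8]) {
    for (int i = 0; i < 8; ++i) {
        uint8_t b = (entry >> (8 * i)) & 0xff;  // one packed byte: 0, 1, or 2
        out[i] = (int8_t)b - 1;                 // map {0,1,2} -> {-1,0,+1}
    }
}

int main() {
    int8_t w[8];
    decode_grid_entry_u64(0x0000000000000202ULL, w);
    for (int i = 0; i < 8; ++i) std::printf("%d ", w[i]);  // 1 1 -1 -1 -1 -1 -1 -1
    std::printf("\n");
}
```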
```cpp
#else
static const uint32_t iq1s_grid_us[2048] = {
    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001,
    0x00010101, 0x00020000, 0x00020002, 0x00020200, 0x00020202, 0x01000101,
    0x01010001, 0x01010100, 0x01010102, 0x01020101, 0x02000000, 0x02000002,
    // ... 2048 entries in total; every nibble of every entry is 0x0, 0x1, or
    // 0x2; the captured diff is cut off partway through this table ...
```
,
0x11111010
,
0x11111011
,
0x11111012
,
0x11111110
,
0x11111111
,
0x11111112
,
0x11111210
,
0x11111211
,
0x11111212
,
0x11121010
,
0x11121011
,
0x11121110
,
0x11121111
,
0x11121112
,
0x11121210
,
0x11121211
,
0x11121212
,
0x12101011
,
0x12101110
,
0x12101111
,
0x12101211
,
0x12101212
,
0x12111010
,
0x12111011
,
0x12111110
,
0x12111111
,
0x12111112
,
0x12111210
,
0x12111211
,
0x12121011
,
0x12121110
,
0x12121111
,
0x12121112
,
0x12121211
,
0x10101020
,
0x10101021
,
0x10101022
,
0x10101120
,
0x10101122
,
0x10101220
,
0x10101221
,
0x10111021
,
0x10111120
,
0x10111121
,
0x10111220
,
0x10111221
,
0x10121020
,
0x10121021
,
0x10121022
,
0x10121120
,
0x10121121
,
0x10121122
,
0x10121220
,
0x10121221
,
0x11101021
,
0x11101121
,
0x11101122
,
0x11101220
,
0x11101221
,
0x11101222
,
0x11111020
,
0x11111021
,
0x11111022
,
0x11111120
,
0x11111121
,
0x11111122
,
0x11111220
,
0x11111221
,
0x11111222
,
0x11121021
,
0x11121120
,
0x11121121
,
0x11121221
,
0x12101022
,
0x12101121
,
0x12101122
,
0x12101220
,
0x12101221
,
0x12101222
,
0x12111021
,
0x12111121
,
0x12111222
,
0x12121022
,
0x12121121
,
0x12121122
,
0x12121220
,
0x12121221
,
0x10102100
,
0x10102101
,
0x10102102
,
0x10102201
,
0x10112000
,
0x10112101
,
0x10112200
,
0x10122001
,
0x10122202
,
0x11102101
,
0x11102200
,
0x11102202
,
0x11112001
,
0x11112100
,
0x11112101
,
0x11112102
,
0x11112200
,
0x11112201
,
0x11122000
,
0x11122002
,
0x11122100
,
0x11122101
,
0x12102002
,
0x12102201
,
0x12112000
,
0x12112002
,
0x12112101
,
0x12112200
,
0x12122001
,
0x12122201
,
0x10102011
,
0x10102012
,
0x10102111
,
0x10102212
,
0x10112011
,
0x10112110
,
0x10112111
,
0x10112112
,
0x10112211
,
0x10122111
,
0x11102011
,
0x11102110
,
0x11102111
,
0x11102112
,
0x11102211
,
0x11112010
,
0x11112011
,
0x11112012
,
0x11112110
,
0x11112111
,
0x11112112
,
0x11112210
,
0x11112211
,
0x11112212
,
0x11122011
,
0x11122110
,
0x11122111
,
0x11122112
,
0x11122211
,
0x12102011
,
0x12102111
,
0x12102211
,
0x12112011
,
0x12112110
,
0x12112111
,
0x12112112
,
0x12112210
,
0x12112211
,
0x12122111
,
0x10102120
,
0x10102220
,
0x10112121
,
0x10112222
,
0x10122020
,
0x10122121
,
0x10122122
,
0x10122221
,
0x11102121
,
0x11102220
,
0x11102221
,
0x11112021
,
0x11112121
,
0x11112122
,
0x11112220
,
0x11112221
,
0x11122022
,
0x11122121
,
0x11122220
,
0x11122222
,
0x12102021
,
0x12102222
,
0x12112022
,
0x12112121
,
0x12112122
,
0x12112220
,
0x12112222
,
0x12122021
,
0x10200101
,
0x10210100
,
0x10210102
,
0x10210201
,
0x10220101
,
0x11200100
,
0x11210000
,
0x11210101
,
0x11210102
,
0x11210200
,
0x11210202
,
0x11220001
,
0x11220100
,
0x11220102
,
0x11220201
,
0x12200001
,
0x12210102
,
0x12220101
,
0x10200011
,
0x10200110
,
0x10200112
,
0x10200211
,
0x10210012
,
0x10210111
,
0x10220011
,
0x10220012
,
0x10220112
,
0x10220211
,
0x11200111
,
0x11200211
,
0x11210011
,
0x11210111
,
0x11210112
,
0x11210211
,
0x11220111
,
0x11220112
,
0x11220212
,
0x12200110
,
0x12200212
,
0x12210012
,
0x12210111
,
0x12220011
,
0x12220112
,
0x12220211
,
0x10210021
,
0x10210122
,
0x10210221
,
0x11200020
,
0x11200021
,
0x11200122
,
0x11210121
,
0x11210122
,
0x11210220
,
0x11220020
,
0x12200121
,
0x12210021
,
0x12210122
,
0x12220121
,
0x10211001
,
0x10211002
,
0x10211101
,
0x10211102
,
0x10211202
,
0x10221001
,
0x10221102
,
0x10221201
,
0x11201000
,
0x11201002
,
0x11201101
,
0x11201200
,
0x11201202
,
0x11211001
,
0x11211100
,
0x11211101
,
0x11211102
,
0x11211201
,
0x11211202
,
0x11221000
,
0x11221002
,
0x11221101
,
0x12201100
,
0x12201101
,
0x12201201
,
0x12211000
,
0x12211002
,
0x12211100
,
0x12211101
,
0x12211102
,
0x12211200
,
0x12211202
,
0x12221001
,
0x12221100
,
0x12221201
,
0x10201111
,
0x10201210
,
0x10201212
,
0x10211011
,
0x10211111
,
0x10211112
,
0x10211211
,
0x11201110
,
0x11201111
,
0x11201112
,
0x11201211
,
0x11211010
,
0x11211011
,
0x11211110
,
0x11211111
,
0x11211112
,
0x11211211
,
0x11221011
,
0x11221110
,
0x11221111
,
0x11221112
,
0x11221211
,
0x12201112
,
0x12201211
,
0x12201212
,
0x12211011
,
0x12211111
,
0x12211112
,
0x12211211
,
0x12211212
,
0x12221012
,
0x12221111
,
0x12221112
,
0x12221210
,
0x10201022
,
0x10201221
,
0x10211121
,
0x10221020
,
0x10221122
,
0x10221220
,
0x10221221
,
0x11201020
,
0x11201121
,
0x11201220
,
0x11201222
,
0x11211021
,
0x11211120
,
0x11211121
,
0x11211122
,
0x11211220
,
0x11211222
,
0x11221020
,
0x11221121
,
0x11221220
,
0x12201020
,
0x12201022
,
0x12201121
,
0x12201222
,
0x12211120
,
0x12211122
,
0x12211220
,
0x12211221
,
0x12221020
,
0x12221120
,
0x12221122
,
0x12221222
,
0x10212102
,
0x10212201
,
0x10222101
,
0x11202001
,
0x11212002
,
0x11212101
,
0x11212202
,
0x11222001
,
0x11222201
,
0x12202101
,
0x12212001
,
0x12212200
,
0x12222102
,
0x10202011
,
0x10202110
,
0x10212010
,
0x10212111
,
0x10222011
,
0x10222110
,
0x10222112
,
0x10222211
,
0x11202010
,
0x11202011
,
0x11202111
,
0x11202112
,
0x11202210
,
0x11212011
,
0x11212110
,
0x11212111
,
0x11212112
,
0x11212211
,
0x11222010
,
0x11222111
,
0x11222212
,
0x12202012
,
0x12202110
,
0x12202212
,
0x12212111
,
0x12222011
,
0x12222110
,
0x12222111
,
0x12222211
,
0x10212021
,
0x10212122
,
0x10212220
,
0x11202021
,
0x11202120
,
0x11202221
,
0x11212020
,
0x11212121
,
0x11212220
,
0x11212222
,
0x11222120
,
0x11222121
,
0x11222221
,
0x12202122
,
0x12212120
,
0x12212220
,
0x12212222
,
0x12222122
,
0x20000000
,
0x20000002
,
0x20000200
,
0x20000202
,
0x20020000
,
0x20020002
,
0x20020200
,
0x20020202
,
0x21000101
,
0x21010000
,
0x21010001
,
0x21010100
,
0x21010102
,
0x21010201
,
0x21020101
,
0x22000000
,
0x22000002
,
0x22000200
,
0x22000202
,
0x22010101
,
0x22020000
,
0x22020002
,
0x22020200
,
0x22020202
,
0x20000111
,
0x20010011
,
0x20010110
,
0x20010112
,
0x20010211
,
0x20020111
,
0x21000011
,
0x21000110
,
0x21000211
,
0x21010010
,
0x21010012
,
0x21010111
,
0x21010112
,
0x21010210
,
0x21010211
,
0x21020110
,
0x21020112
,
0x21020211
,
0x22000111
,
0x22000211
,
0x22010110
,
0x22010112
,
0x22010211
,
0x22020111
,
0x20000020
,
0x20000022
,
0x20000220
,
0x20000222
,
0x20010121
,
0x20020020
,
0x20020022
,
0x20020220
,
0x20020222
,
0x21010021
,
0x21010120
,
0x21010221
,
0x21020121
,
0x22000020
,
0x22000022
,
0x22000220
,
0x22000222
,
0x22010121
,
0x22020020
,
0x22020022
,
0x22020220
,
0x22020222
,
0x20011100
,
0x20011201
,
0x21001001
,
0x21001100
,
0x21011001
,
0x21011101
,
0x21011202
,
0x21021001
,
0x21021100
,
0x21021201
,
0x22011100
,
0x22011201
,
0x20001011
,
0x20001211
,
0x20011012
,
0x20011111
,
0x20011212
,
0x20021112
,
0x20021211
,
0x21001010
,
0x21001011
,
0x21001111
,
0x21001210
,
0x21011011
,
0x21011110
,
0x21011111
,
0x21011112
,
0x21011211
,
0x21011212
,
0x21021111
,
0x21021112
,
0x21021210
,
0x21021212
,
0x22001011
,
0x22001110
,
0x22001112
,
0x22001211
,
0x22011010
,
0x22011012
,
0x22011111
,
0x22011210
,
0x22021112
,
0x20011021
,
0x20011122
,
0x20011221
,
0x20021121
,
0x21001021
,
0x21001120
,
0x21001221
,
0x21001222
,
0x21011020
,
0x21011121
,
0x21011221
,
0x21011222
,
0x21021021
,
0x21021122
,
0x21021222
,
0x22001121
,
0x22011021
,
0x22011222
,
0x22021120
,
0x20002000
,
0x20002002
,
0x20002200
,
0x20002202
,
0x20012101
,
0x20022000
,
0x20022002
,
0x20022200
,
0x20022202
,
0x21002001
,
0x21002101
,
0x21012001
,
0x21012100
,
0x21012201
,
0x21022101
,
0x21022201
,
0x22002000
,
0x22002002
,
0x22002200
,
0x22002202
,
0x22012101
,
0x22022000
,
0x22022002
,
0x22022200
,
0x22022202
,
0x20002111
,
0x20002112
,
0x20012011
,
0x20012110
,
0x20012112
,
0x20022111
,
0x21002011
,
0x21002110
,
0x21002112
,
0x21002211
,
0x21012010
,
0x21012012
,
0x21012111
,
0x21012212
,
0x21022011
,
0x21022110
,
0x22002111
,
0x22012112
,
0x22012211
,
0x22022111
,
0x20002020
,
0x20002022
,
0x20002220
,
0x20002222
,
0x20012121
,
0x20022020
,
0x20022022
,
0x20022220
,
0x20022222
,
0x21002121
,
0x21012021
,
0x21012120
,
0x21012122
,
0x22002020
,
0x22002022
,
0x22002220
,
0x22002222
,
0x22012121
,
0x22022020
,
0x22022022
,
0x22022220
,
0x22022222
,
0x20100101
,
0x20110001
,
0x20110102
,
0x20110200
,
0x20110201
,
0x20120101
,
0x21100001
,
0x21100102
,
0x21100201
,
0x21110101
,
0x21110200
,
0x21110202
,
0x21120201
,
0x21120202
,
0x22100101
,
0x22110001
,
0x22110100
,
0x22110102
,
0x22110201
,
0x22120101
,
0x20100011
,
0x20100110
,
0x20100112
,
0x20100211
,
0x20110010
,
0x20110111
,
0x20110210
,
0x20110212
,
0x20120011
,
0x20120110
,
0x20120112
,
0x20120211
,
0x21100010
,
0x21100111
,
0x21110010
,
0x21110011
,
0x21110110
,
0x21110111
,
0x21110112
,
0x21110211
,
0x21120012
,
0x21120111
,
0x22100110
,
0x22100112
,
0x22110012
,
0x22110111
,
0x22110210
,
0x22120011
,
0x22120110
,
0x22120112
,
0x22120211
,
0x20100121
,
0x20110021
,
0x20110120
,
0x20110221
,
0x20120121
,
0x21100120
,
0x21100122
,
0x21100221
,
0x21110020
,
0x21110022
,
0x21110121
,
0x21110220
,
0x21120122
,
0x21120221
,
0x22100121
,
0x22110120
,
0x22110122
,
0x22120221
,
0x20101001
,
0x20101100
,
0x20101102
,
0x20111000
,
0x20111101
,
0x20111200
,
0x20121102
,
0x21101000
,
0x21101202
,
0x21111001
,
0x21111100
,
0x21111101
,
0x21111102
,
0x21111200
,
0x21111201
,
0x21121000
,
0x21121001
,
0x21121002
,
0x21121101
,
0x22101100
,
0x22101102
,
0x22111002
,
0x22111100
,
0x22111101
,
0x22111200
,
0x22121001
,
0x22121201
,
0x20101010
,
0x20101111
,
0x20101210
,
0x20101212
,
0x20111010
,
0x20111011
,
0x20111110
,
0x20111111
,
0x20111112
,
0x20111211
,
0x20121011
,
0x20121111
,
0x20121211
,
0x20121212
,
0x21101011
,
0x21101110
,
0x21101111
,
0x21101112
,
0x21101211
,
0x21111010
,
0x21111011
,
0x21111012
,
0x21111110
,
0x21111111
,
0x21111112
,
0x21111210
,
0x21111211
,
0x21111212
,
0x21121011
,
0x21121110
,
0x21121111
,
0x21121112
,
0x21121211
,
0x22101011
,
0x22101111
,
0x22101210
,
0x22111011
,
0x22111012
,
0x22111110
,
0x22111111
,
0x22111112
,
0x22111211
,
0x22111212
,
0x22121010
,
0x22121012
,
0x22121111
,
0x22121210
,
0x22121212
,
0x20101021
,
0x20101120
,
0x20111020
,
0x20111121
,
0x20111221
,
0x20121020
,
0x20121122
,
0x20121221
,
0x21101121
,
0x21101220
,
0x21101221
,
0x21111021
,
0x21111022
,
0x21111121
,
0x21111122
,
0x21111221
,
0x21121121
,
0x21121220
,
0x22101022
,
0x22101120
,
0x22101221
,
0x22101222
,
0x22111022
,
0x22111120
,
0x22111121
,
0x22121120
,
0x22121122
,
0x22121221
,
0x20102101
,
0x20112102
,
0x20112201
,
0x20122101
,
0x21102001
,
0x21102102
,
0x21112000
,
0x21112002
,
0x21112101
,
0x21112102
,
0x21112202
,
0x21122100
,
0x21122101
,
0x22102101
,
0x22112001
,
0x22112102
,
0x22112201
,
0x22122101
,
0x20102110
,
0x20102112
,
0x20102211
,
0x20112010
,
0x20112012
,
0x20112111
,
0x20112210
,
0x20112212
,
0x20122010
,
0x20122011
,
0x20122110
,
0x20122112
,
0x21102010
,
0x21102012
,
0x21102111
,
0x21102210
,
0x21102212
,
0x21112011
,
0x21112110
,
0x21112111
,
0x21112112
,
0x21112211
,
0x21122012
,
0x21122111
,
0x21122112
,
0x21122212
,
0x22102011
,
0x22102110
,
0x22112010
,
0x22112012
,
0x22112111
,
0x22112212
,
0x22122011
,
0x22122112
,
0x20102121
,
0x20112121
,
0x20122121
,
0x21102120
,
0x21102122
,
0x21102221
,
0x21112020
,
0x21112121
,
0x21112220
,
0x21122021
,
0x22102121
,
0x22112021
,
0x22112120
,
0x22112121
,
0x22112122
,
0x20200000
,
0x20200002
,
0x20200200
,
0x20200202
,
0x20210101
,
0x20220000
,
0x20220002
,
0x20220200
,
0x20220202
,
0x21200101
,
0x21210001
,
0x21210100
,
0x21210102
,
0x21210201
,
0x22200000
,
0x22200002
,
0x22200200
,
0x22200202
,
0x22210101
,
0x22220000
,
0x22220002
,
0x22220200
,
0x22220202
,
0x20200111
,
0x20200211
,
0x20210011
,
0x20210110
,
0x20210112
,
0x20210211
,
0x20210212
,
0x21200112
,
0x21200211
,
0x21210011
,
0x21210111
,
0x21210210
,
0x21210212
,
0x21220011
,
0x21220110
,
0x22200111
,
0x22210010
,
0x22210012
,
0x22210112
,
0x22210211
,
0x20200022
,
0x20200220
,
0x20200222
,
0x20210020
,
0x20210221
,
0x20220022
,
0x20220220
,
0x20220222
,
0x21200121
,
0x21210021
,
0x21210122
,
0x21210221
,
0x21220121
,
0x22200020
,
0x22200022
,
0x22200220
,
0x22200222
,
0x22210121
,
0x22220020
,
0x22220022
,
0x22220220
,
0x22220222
,
0x20211201
,
0x20221101
,
0x21201001
,
0x21201100
,
0x21211000
,
0x21211100
,
0x21211101
,
0x21211200
,
0x21211202
,
0x21221001
,
0x21221101
,
0x21221102
,
0x21221200
,
0x21221201
,
0x22201101
,
0x20201112
,
0x20201211
,
0x20211010
,
0x20211012
,
0x20211111
,
0x20211210
,
0x20221112
,
0x20221211
,
0x21201012
,
0x21201111
,
0x21211011
,
0x21211110
,
0x21211111
,
0x21211112
,
0x21211211
,
0x21221111
,
0x21221212
,
0x22201011
,
0x22201110
,
0x22201111
,
0x22201112
,
0x22201211
,
0x22211012
,
0x22211111
,
0x22211210
,
0x20201121
,
0x20211021
,
0x20211122
,
0x20211222
,
0x20221021
,
0x20221121
,
0x21201120
,
0x21201122
,
0x21201222
,
0x21211022
,
0x21211121
,
0x21211122
,
0x21211220
,
0x21221020
,
0x21221022
,
0x22201122
,
0x22211020
,
0x22211121
,
0x22211122
,
0x22211221
,
0x22221021
,
0x22221120
,
0x22221122
,
0x20202000
,
0x20202002
,
0x20202200
,
0x20202202
,
0x20222000
,
0x20222002
,
0x20222200
,
0x20222202
,
0x21212001
,
0x21212100
,
0x21212102
,
0x21212201
,
0x22202000
,
0x22202002
,
0x22202200
,
0x22202202
,
0x22212101
,
0x22222000
,
0x22222002
,
0x22222200
,
0x22222202
,
0x20202111
,
0x20212110
,
0x20212211
,
0x20222011
,
0x20222111
,
0x21202011
,
0x21212010
,
0x21212111
,
0x21212212
,
0x21222011
,
0x21222112
,
0x21222211
,
0x22212010
,
0x22212112
,
0x20202020
,
0x20202022
,
0x20202220
,
0x20202222
,
0x20222020
,
0x20222022
,
0x20222220
,
0x20222222
,
0x21212021
,
0x21212120
,
0x21212122
,
0x22202020
,
0x22202022
,
0x22202220
,
0x22202222
,
0x22212121
,
0x22222020
,
0x22222022
,
0x22222220
,
0x22222222
,
};
#endif

#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
    0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
    0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
...
@@ -181,31 +989,41 @@ const uint64_t keven_signs[128] = {
...
    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};
#endif
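/*
   How a keven_signs entry is built (a sketch of the usual iq2 "even signs"
   convention: bit k of the 7-bit index selects a -1 sign for byte k, and the
   8th byte is chosen so that the total number of minus signs is even):

   static uint64_t keven_signs_entry(int idx) {          // idx in 0..127
       uint64_t entry = 0;
       int parity = 0;
       for (int k = 0; k < 7; ++k) {
           int bit = (idx >> k) & 1;                     // 1 -> sign byte 0xff (-1)
           parity ^= bit;
           entry |= (uint64_t)(bit ? 0xff : 0x01) << (8*k);
       }
       entry |= (uint64_t)(parity ? 0xff : 0x01) << 56;  // parity sign in byte 7
       return entry;
   }

   e.g. idx = 1 gives 0xff010101010101ff: byte 0 flips, so byte 7 must flip too.
*/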
}
/* moonll: change mul_mat
   add typeB and strideB
*/
bool iqk_mul_mat(long Nx, long Ny, long ne00,
                 int typeA, const void * A, long strideA,
                 int typeB, const void * B, long strideB,
                 float * C, long stride_C, int ith, int nth) {
    MulMat mm;
    if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
        return false;
    }
    size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
    size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
    auto nrc_x = (Nx + nth - 1)/nth;
    auto first_x = ith*nrc_x;
    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
    DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
    return true;
}
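/*
   A minimal usage sketch (hypothetical shapes): strideA and strideB are row
   strides counted in blocks of the respective quant type, since they are
   multiplied by ggml_type_size() above; C is row-major with stride_C floats
   between rows, and the Nx dimension is split across nth threads.

   for (int ith = 0; ith < nth; ++ith)   // normally one call per worker thread
       iqk_mul_mat(Nx, Ny, ne00, typeA, A, strideA, typeB, B, strideB,
                   C, stride_C, ith, nth);
*/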
bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
                     float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
...
@@ -213,9 +1031,11 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
    MulMat mm;
    int row_size_q8;
    /* moonll
    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
        return false;
    }
    */
    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
    int nrc_x = (Nx + nth - 1)/nth;
    int first_x = ith*nrc_x;
...
@@ -233,6 +1053,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#endif
//#define HAVE_FANCY_SIMD

namespace {
...
@@ -257,10 +1078,9 @@ template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
    }

#ifdef HAVE_FANCY_SIMD
    inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#endif
    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
    inline float scale(int iy, int i) const { return y[iy][i].d; }
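/*
   For reference, the block_q8_K layout the loaders above index into (ggml's
   definition, repeated here as a reading aid):

   typedef struct {
       float   d;                // per-superblock scale of the q8 activations
       int8_t  qs[QK_K];         // 256 quantized values
       int16_t bsums[QK_K/16];   // sums of each group of 16 quants
   } block_q8_K;

   load_quants(iy, i, j) fetches 32 of the qs bytes (load_quants64 fetches 64),
   and load_bsums(iy, i) fetches all 16 group sums, which the kernels below use
   to fold per-block minima into the dot product.
*/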
...
@@ -353,6 +1173,23 @@ struct ScaleIQ4XS {
    const __m128i m32 = _mm_set1_epi16(-32);
};

struct Scales8KBase {
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s  = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
    inline __m256i shuffle(__m128i mins) const {
        return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0]));
    }
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
};
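/*
   Scalar view of what accum_mins folds into the accumulator (a sketch; the two
   shuffles duplicate each 16-bit min so one _mm256_madd_epi16 pairs min j with
   the two adjacent group sums it covers):

   static inline void accum_mins_ref_sketch(const int16_t mins[8], const int16_t bsums[16],
                                            float c, float q8_scale, float& acc) {
       int32_t sum = 0;
       for (int j = 0; j < 8; ++j) sum += (int32_t)mins[j]*(bsums[2*j+0] + bsums[2*j+1]);
       acc += c*q8_scale*(float)sum;
   }
*/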
template <typename Block>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
...
@@ -367,6 +1204,16 @@ struct BaseDequantizer {
    float d;
};

__m128i inline load_iq4nl_values_128() {
    static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
    return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}

__m256i inline load_iq4nl_values_256() {
    auto val128 = load_iq4nl_values_128();
    return MM256_SET_M128I(val128, val128);
}
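/*
   Note: this is ggml's signed kvalues_iq4nl table {-127, -104, ..., 89, 113}
   shifted by +128 (e.g. 1 == -127 + 128 and 241 == 113 + 128), so the looked-up
   values can be fed to unsigned-by-signed products such as _mm256_maddubs_epi16
   and _mm512_dpbusd_epi32; the constant +128 bias is cancelled later via the
   -128.f*d term passed to accum_mins.
*/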
#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================
...
@@ -434,8 +1281,17 @@ struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    Scales8K s8k;
};
/*
moonll DequantizerIQ4XS
*/
__m512i inline load_iq4nl_values_512() {
    auto val256 = load_iq4nl_values_256();
    return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
...
@@ -444,14 +1300,10 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
        scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
        scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
    }
    inline void prepare(const uint8_t * q4) {
        bits.prepare64(q4);
...
@@ -467,11 +1319,17 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    }

    Q4Bits bits;
    Scales8KBase s8k;
    ScaleIQ4XS siq4;
    const __m512i values;
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    const __m512i shuffles[4] = {
        _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
    };
};

struct HighBit5 {
...
@@ -646,6 +1504,149 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
    }
}

template <typename Q8>
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
    const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
    const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1));
    const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2));
    const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3));
    auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
    sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
    accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n/QK_K;
    Q8<nrc_y> q8(info);
    Dequantizer deq(vx, bx);
    __m256 accm[nrc_y];
    __m512 accd[nrc_y];
    __m512i scales[2];
    for (int ix = 0; ix < nrc_x; ++ix) {
        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
        deq.new_row(ix);
        for (int i = 0; i < nb; ++i) {
            deq.new_block(i, q8, accm, scales);
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }
    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n/QK_K;
    Q8<nrc_y> q8(info);
    Dequantizer deq(vx, bx);
    __m256 accm[nrc_y];
    __m512 accd[nrc_y];
    __m512i scales[4];
    for (int ix = 0; ix < nrc_x; ++ix) {
        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
        deq.new_row(ix);
        for (int i = 0; i < nb; ++i) {
            deq.new_block(i, q8, accm, scales);
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(
                                _mm512_setzero_si512(), p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]);
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }
    }
}

template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n/QK_K;
    constexpr int k_nx = 2;
    Q8<1> q8(info);
    Dequantizer deq1(vx, bx);
    Dequantizer deq2(vx, bx);
    Dequantizer * deq[k_nx];
    deq[0] = &deq1;
    deq[1] = &deq2;
    __m512i scales[2*k_nx];
    for (int ix = 0; ix < nrc_x; ++ix) {
        auto accd = _mm512_setzero_ps();
        auto accm = _mm256_setzero_ps();
        for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix);
        for (int i = 0; i < nb/k_nx; ++i) {
            for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx);
            for (int kx = 0; kx < k_nx; ++kx) {
                compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd);
            }
        }
        if (2*(nb/2) < nb) {
            int i0 = 2*(nb/2);
            deq[0]->new_block(i0, q8, &accm, scales);
            compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd);
        }
        auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1));
        info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256)));
    }
}
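/*
   Why compute_block can pack 32-bit partial sums back to 16 bits (sketch): each
   _mm512_dpbusd_epi32 lane is a dot product of just 4 unsigned quants with 4
   signed q8 values, and for the K-quants handled here (at most 6-bit values, so
   x <= 63) the worst case 4*63*127 = 32004 still fits in int16. Packing p1..p4
   to epi16 then lets a single _mm512_dpwssd_epi32 apply the 16-bit block scales
   and accumulate in one instruction.
*/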
#else
// ===================================== Vanilla AVX2 =====================================
...
@@ -724,17 +1725,8 @@ struct HighBit3 {
    __m256i hbits;
};

/*
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
...
@@ -755,7 +1747,7 @@ inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i,
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
    }
}
*/
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
@@ -889,22 +1881,8 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
...
@@ -889,22 +1881,8 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
const
__m256i
mh
=
_mm256_set1_epi8
(
0x30
);
const
__m256i
mh
=
_mm256_set1_epi8
(
0x30
);
};
};
inline
__m256i
get_scale_shuffle_16
(
int
i
)
{
static
const
uint8_t
k_shuffle
[
128
]
=
{
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
};
return
_mm256_loadu_si256
((
const
__m256i
*
)
k_shuffle
+
i
);
}
inline
void
set_scales_16
(
const
__m256i
&
all_scales
,
__m256i
*
scales
)
{
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
0
));
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
1
));
scales
[
2
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
2
));
scales
[
3
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
3
));
}
template
<
typename
Dequantizer
,
int
nrc_y
>
template
<
typename
Dequantizer
,
int
nrc_y
>
static
void
mul_mat_qY_K_q8_K_T
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
static
void
mul_mat_qY_K_q8_K_T
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
...
@@ -1000,6 +1978,8 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
}
#endif // Zen4 or vanilla AVX2

//
// ============================== Legacy quants
//
...
@@ -1075,6 +2055,28 @@ struct ScaleHelperQ_0 {
    template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
    template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
template <int min_value>
struct ScaleHelperQ_0_1 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
        return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
    }
    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm_mul256_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        float d = GGML_FP16_TO_FP32(y->d);
        return std::make_pair(d, -d*float(min_value));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
    const __m128 min = _mm_set1_ps(float(-min_value));
};

struct ScaleHelperQ_1 {
    uint32_t scales8[4];
...
@@ -1235,6 +2237,12 @@ struct Q8_0_Dequantizer {
    }
};

struct Q8_0_1_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
    }
};

struct Q4_0_Dequantizer {
    Dequantizer4bit b4;
    const __m256i m8 = _mm256_set1_epi8(-8);
...
@@ -1320,6 +2328,11 @@ struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_
    Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
    Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    // using Sum4T = Sum4TypeQ81;
    inline static int block_size() { return QK8_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
    Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
...
@@ -1353,8 +2366,466 @@ void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
    }
}

/*
moonll: add some structs for DequantizerIQ2XXS:
SimpleBits
EvenSignHelper
*/
struct SimpleBits {
    __m256i values[4];
};

struct EvenSignHelper {
#ifdef HAVE_FANCY_SIMD
    union sbits_t {
        __m128i vec;
        __mmask32 mask[4];
    };
    IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
        aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
        auto pcnt = _mm256_popcnt_epi32(aux);
        sbits_t sbits;
        sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
        values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
        values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
        //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
        //const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
        //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
        //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
    }
    const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
    const __m256i mask   = _mm256_set1_epi32(127);
    const __m256i mone   = _mm256_set1_epi32(1);
#else
    inline void sign_value(uint32_t aux32, __m256i& value) const {
        auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
                                       keven_signs[(aux32 >>  7) & 127], keven_signs[(aux32 >>  0) & 127]);
        value = _mm256_sign_epi8(value, signs);
    }
#endif
};
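/*
   Scalar equivalent of EvenSignHelper::sign_value (a sketch): aux32 carries
   four 7-bit indices into keven_signs; each selects a 64-bit ±1 byte pattern
   whose negative bytes flip the corresponding 8 quants (the 8th sign of each
   group is the parity bit, so only 7 bits per group need to be stored).

   static inline void sign_value_ref_sketch(uint32_t aux32, int8_t values[32]) {
       for (int g = 0; g < 4; ++g) {
           uint64_t signs = keven_signs[(aux32 >> (7*g)) & 127];
           for (int k = 0; k < 8; ++k)
               if ((signs >> (8*k)) & 0x80) values[8*g + k] = -values[8*g + k];
       }
   }
*/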
/*
moonll: add multiply_add for mul_mat_qX_K_q8_K_IQ_1
add funcs:
get_scale_shuffle_8
get_scale_shuffle_16
set_scales_16
*/
inline __m256i get_scale_shuffle_8(int i) {
    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}

inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}

inline __m256i get_scale_shuffle_16(int i) {
    static const uint8_t k_shuffle[128] = {
         0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
         2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,
         4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,
         6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,
         8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,
        10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11,
        12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
        14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
    };
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}

inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}
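/*
   Reading aid: set_scales_8(all_scales, j, scales) broadcasts the single 16-bit
   scale 4*j+k into every lane of scales[k] (one scale per 32 quants), while
   set_scales_16 spreads sixteen scales so that scales[k] holds scale 2k in its
   low 128-bit half and scale 2k+1 in its high half (one scale per 16 quants);
   either layout lines up with the 16-bit products made by _mm256_maddubs_epi16.
*/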
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
#endif
    }
}
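/*
   What multiply_add accumulates, in scalar form (a sketch; x holds the 128
   unpacked unsigned quants of sub-block j, scales16 one 16-bit scale per group
   of 16 quants, y the matching 128 int8 activations):

   static inline void multiply_add_ref_sketch(const uint8_t x[128], const int16_t scales16[8],
                                              const int8_t y[128], int32_t& sum) {
       for (int k = 0; k < 128; ++k) sum += (int32_t)scales16[k/16]*x[k]*y[k];
   }
*/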
/*
moonll: add multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1
add funcs:
set_scales_8_iq
set_scales_16_iq
add MUL_MAT:
mul_mat_qX_K_q8_K_IQ_1
mul_mat_qX_K_q8_K_IQ_N
mul_mat_qX_K_q8_K_IQ
*/
template <typename Bits>
inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(p1, p3);
        sumi[1] = _mm256_add_epi32(p2, p4);
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3));
        sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4));
#endif
    }
}
inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) {
//#ifdef HAVE_FANCY_SIMD
    auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100)
                          : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908);
    scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
    scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)));
//#else
//    set_scales_8(all_scales, j, scales);
//#endif
}

inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) {
#ifdef HAVE_FANCY_SIMD
    auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100);
    scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
    scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8)));
#else
    set_scales_16(all_scales, scales);
#endif
}
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const int nb = n/QK_K;
    Q8<1> q8(info);
    Dequantizer deq(vx, bx);
    __m256i scales[2];
    __m256i q8_quants[4];
    for (int ix = 0; ix < nrc_x; ++ix) {
        __m256 accd = _mm256_setzero_ps();
        deq.new_row(ix);
        for (int i = 0; i < nb; ++i) {
            __m256i sumi[2], all_scales[Dequantizer::num_blocks/8];
            deq.new_block(i, all_scales);
            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j, q8, q8_quants);
                if constexpr (Dequantizer::num_blocks == 8) {
                    set_scales_8_iq(j, all_scales[0], scales);
                } else {
                    set_scales_16_iq(all_scales[j], scales);
                }
                multiply_add_1(j, deq.bits, scales, q8_quants, sumi);
            }
            accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd);
        }
        info.store(ix, 0, hsum_float_8(accd));
    }
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const int nb = n/QK_K;
    Q8<nrc_y> q8(info);
    Dequantizer deq(vx, bx);
    __m256i scales[4];
    __m256  accd[nrc_y];
    for (int ix = 0; ix < nrc_x; ++ix) {
        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();
        deq.new_row(ix);
        for (int i = 0; i < nb; ++i) {
            __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8];
            //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
            __m256i mins;
            float dmin = deq.new_block(i, all_scales, mins);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, i);
                auto prod  = _mm256_madd_epi16(mins, bsums);
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
            }
            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                if constexpr (Dequantizer::num_blocks == 8) {
                    set_scales_8(all_scales[0], j, scales);
                } else {
                    set_scales_16(all_scales[j], scales);
                }
                //multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }
    }
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
#ifdef HAVE_FANCY_SIMD
    if constexpr (nrc_y == 1) {
        mul_mat_qX_K_q8_K_IQ_1<Dequantizer>(n, vx, bx, info, nrc_x);
    } else {
        mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
    }
#else
    mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
#endif
}
/*
moonll iq1s
core func for iq1s mul_mat_iq1_s_q8_K
*/
template <int nrc_y>
static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    GGML_ASSERT(n % QK_K == 0);
    Q8<nrc_y, block_q8_K> q8(info);
    __m256i qx[8];
    __m256i scales[4];
    __m256  acc[nrc_y] = {};
    auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000
    __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100);
    for (int ix = 0; ix < nrc_x; ++ix) {
        auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx);
        for (int ibl = 0; ibl < n/QK_K; ++ibl) {
            float d = GGML_FP16_TO_FP32(iq1s[ibl].d);
            auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh);
            auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7));
            scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1));
#ifdef HAVE_FANCY_SIMD
            auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9));
#else
            auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7)));
#endif
            deltas128 = _mm_mullo_epi16(scales128, deltas128);
            scales128 = _mm_slli_epi16(scales128, 3);
            auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128);
            auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128);
            auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7
            auto all_scales = MM256_SET_M128I(scales128, scales128);
            auto shuffle = shuffle0;
            for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle);
                shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4));
            }
            const uint8_t  * qs = iq1s[ibl].qs;
            const uint16_t * qh = iq1s[ibl].qh;
            for (int ib = 0; ib < QK_K/32; ib += 2) {
                qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                             iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
                qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                             iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
                qs += 8;
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, ibl);
                auto sumi = _mm256_setzero_si256();
                for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                    auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0);
                    auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1);
#ifdef HAVE_FANCY_SIMD
                    auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2);
                    sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2));
#else
                    auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2);
                    auto dot  = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2));
                    sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot));
#endif
                }
#ifdef HAVE_FANCY_SIMD
                sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas);
#else
                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas));
#endif
                acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, 0.125f*hsum_float_8(acc[iy]));
            acc[iy] = _mm256_setzero_ps();
        }
    }
}
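/*
A scalar sketch (illustration only) of what the kernel above computes for one
32-weight IQ1_S sub-block. It assumes, as the kernel does, that iq1s_grid_us packs
each 8-weight pattern as unsigned bytes w+1 in a 64-bit entry, so the +/-1/8 IQ1_S
delta can be applied through the q8 block sum (bsum); the q8 row scale is omitted.
*/
static float iq1s_dot_block_ref(float d, uint16_t qh, const uint8_t * qs,
                                const int8_t * q8, int bsum) {
    int   ls = 2*((qh >> 12) & 7) + 1;            // odd block scale, 1..15
    float dl = (qh & 0x8000) ? -0.125f : 0.125f;  // delta sign from the top qh bit
    int sumi = 0;
    for (int g = 0; g < 4; ++g) {                 // 4 groups of 8 weights per sub-block
        const uint8_t * w = (const uint8_t *)&iq1s_grid_us[qs[g] | (((qh >> 3*g) << 8) & 0x700)];
        for (int k = 0; k < 8; ++k) sumi += (int(w[k]) - 1)*q8[8*g + k];  // undo the +1 offset
    }
    return d*ls*(sumi + dl*bsum);
}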
/*
moonll iq1s
DequantizerIQ2XXS: an important dequantizer that the IQ1_S path builds on
*/
struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    constexpr static int num_blocks = 8;

    union Data {
        __m256i  vec;
        uint32_t val[8];
    };

    inline __m128i load_scales(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        const uint16_t * a16 = (const uint16_t *)x[i].qs;
        auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12);
        return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1));
    }

    inline void new_block(int i, __m256i * scales) {
        auto sc16 = load_scales(i);
        scales[0] = MM256_SET_M128I(sc16, sc16);
    }
    inline float new_block(int i, __m256i * scales, __m256i& mins) {
        auto sc16 = load_scales(i);
        mins = scb.shuffle(sc16);
        scales[0] = MM256_SET_M128I(sc16, sc16);
        return -d*minv;
    }

    inline static void make4(const uint32_t * aux32, __m256i * values) {
        const uint8_t * aux8 = (const uint8_t *)aux32;
        values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]);
        values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]);
        values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]);
        values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]);
    }

    IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0);
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2);
#else
        esh.sign_value(aux32[1], values[0]);
        esh.sign_value(aux32[3], values[1]);
        esh.sign_value(aux32[5], values[2]);
        esh.sign_value(aux32[7], values[3]);
#endif
    }
    inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const {
        make4(aux32, values);
        sign_values(aux32, values);
        for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value);
    }
    inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const {
        make4(aux32, values);
        sign_values(aux32, q8);
    }
    inline void prepare(int i, int j) {
        Data data;
        data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4_signed(data.val, min_value, bits.values);
    }
    inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) {
        for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k);
        Data data;
        data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4(data.val, bits.values, q8_quants);
    }

    constexpr static int minv = 43;

    SimpleBits bits;
    Scales8KBase scb;
    EvenSignHelper esh;
    const __m256i min_value = _mm256_set1_epi8(minv);
    const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
};
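/*
Illustration only: the scale layout load_scales() above relies on. In an IQ2_XXS
superblock every 4th 16-bit word of x[i].qs carries a 4-bit scale in its top bits,
one per 32-weight group, and the kernel widens it to the odd value 2*s + 1.
*/
static int iq2xxs_scale_ref(const block_iq2_xxs * x, int i, int ib32) {
    const uint16_t * a16 = (const uint16_t *)x[i].qs;
    return 2*(a16[4*ib32 + 3] >> 12) + 1;  // ib32 = 0..7 indexes the 32-weight groups
}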
/*
moonll
add Q8_0_Unpacker and DequantizerIQ2XXS support
add function mul_mat_qX_K_q8_K_IQ
*/
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
                  std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
        m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
...
@@ -1364,7 +2835,7 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
        m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker> ||
                       std::is_same_v<Dequantizer, Q8_0_1_Unpacker>) {
        m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
...
@@ -1374,16 +2845,37 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
        m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>;
    }
    else {
#ifdef HAVE_FANCY_SIMD
        if constexpr (std::is_same_v<Dequantizer, DequantizerIQ4XS>) {
            m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
            m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 8>;
        } else {
            m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1<Dequantizer>;
            m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>;
        }
#else
        if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
                      std::is_same_v<Dequantizer, DequantizerQ3K> ||
...
@@ -1410,11 +2902,260 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    }
}
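// Reading aid: in every branch above, m.funcs[i] holds the kernel instantiated for
// i+1 right-hand-side columns, so a caller working on ny columns at a time dispatches
// through funcs[ny-1]; func16 (set only for IQ1_S below) is an optional 16-column path.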
struct QFBase {
#ifdef __AVX512F__
    constexpr static int k_step = 16;
    using Data = __m512;
    using Acc  = __m512;
    static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); }
    static inline Data load(const float * x) { return _mm512_loadu_ps(x); }
    static inline Data load(const ggml_bf16_t * x) {
        return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)x)), 16));
    }
    static inline Acc acc(Acc prev, const Data& y, const Data& x) {
        return _mm512_fmadd_ps(y, x, prev);
    }
    static inline Acc acc_first(const Data& y, const Data& x) {
        return _mm512_mul_ps(y, x);
    }
    static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); }
    static inline float hsum(Acc acc) {
        return _mm512_reduce_add_ps(acc);
    }
    template <typename Float>
    static inline Data load4Floats(const Float * x) {
        return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
    }
    static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
        acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc);
        acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
        acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
        acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
        return acc;
    }
    static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
        auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00));
        acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
        acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
        acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
        return acc;
    }
    static inline __m128 hsum_r4(Acc acc) {
        auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1));
        auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3));
        return _mm_add_ps(sum1, sum2);
    }
#else
    constexpr static int k_step = 8;
    using Data = __m256;
    using Acc  = __m256;
    static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); }
    static inline Data load(const float * x) { return _mm256_loadu_ps(x); }
    static inline Data load(const ggml_bf16_t * x) {
        return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)x)), 16));
    }
    static inline Acc acc(Acc prev, const Data& y, const Data& x) {
        return _mm256_fmadd_ps(y, x, prev);
    }
    static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); }
    static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
        acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc);
        acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
        acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
        acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
        return acc;
    }
    static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
        auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00));
        acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
        acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
        acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
        return acc;
    }
    static inline Acc acc_first(const Data& y, const Data& x) { return _mm256_mul_ps(y, x); }
    static inline float hsum(Acc acc) { return hsum_float_8(acc); }
    static inline __m128 hsum_r4(Acc acc) {
        return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
    }
    template <typename Float>
    static inline Data load4Floats(const Float * x) {
        return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
    }
#endif
    static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
    static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
    static inline __m128 load128(const ggml_bf16_t * x) {
        return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)x)), 16));
    }
};
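/*
Scalar sketch of the bf16 load above: bf16 is the upper half of an IEEE-754 f32,
so the conversion is a zero-extend plus a 16-bit left shift, exactly what the
_mm*_slli_epi32 calls do a full vector at a time. (Needs <cstring> for std::memcpy.)
*/
static inline float bf16_to_f32_ref(uint16_t h) {
    uint32_t u = (uint32_t)h << 16;
    float f;
    std::memcpy(&f, &u, sizeof f);  // bit-cast the shifted pattern to float
    return f;
}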
template <typename Float, int nrc_in> struct QFT final : public QFBase {
    constexpr static int nrc = nrc_in;
    QFT(const DataInfo& info) {
        for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy);
    }
    QFT(const char * cx, size_t bx) {
        for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
    }
    IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
    IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
    IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const {
        xv[0] = load1(ix+0, i);
        xv[1] = load1(ix+1, i);
        xv[2] = load1(ix+2, i);
        xv[3] = load1(ix+3, i);
#ifdef __AVX512F__
        auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]);
        auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]);
        auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]);
        auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]);
        xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
        xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
        xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
        xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
#else
        auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]);
        auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]);
        auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]);
        auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]);
        xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
        xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
        xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
        xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
#endif
    }
    const Float * y[nrc];
};
template <typename Qy, typename Qx>
IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
    int nb  = n/QFBase::k_step;
    int nb4 = n/4;
    Qy y(info);
    Qx x(cx + ix0*bx, bx);
    QFBase::Data xv[Qx::nrc];
    QFBase::Acc  acc[Qx::nrc*Qy::nrc];
    auto yv = y.load1(0, 0);
    for (int ix = 0; ix < Qx::nrc; ++ix) {
        xv[ix] = x.load1(ix, 0);
        acc[ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int iy = 1; iy < Qy::nrc; ++iy) {
        yv = y.load1(iy, 0);
        for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int i = 1; i < nb; ++i) {
        yv = y.load1(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load1(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load1(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
        yv = y.load_tail(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load_tail(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load_tail(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
}
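// Worked example for the tail loop above (AVX2 path, QFBase::k_step == 8): with
// n = 100, nb = 100/8 = 12 full 8-float steps cover elements 0..95, nb4 = 100/4 = 25,
// so the 4-float tail runs from i = (8/4)*12 = 24 to 24 and covers elements 96..99.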
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template <int nrc_y, typename FloatX, typename FloatY>
void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const char * cx = (const char *)vx;
    // TBD if we want this
    //if constexpr (nrc_y == 1) {
    //    constexpr int k_nx = 2;
    //    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
    //        mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    //    }
    //    if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
    //        int nx = nrc_x - lastx;
    //        switch (nx) {
    //            case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
    //            case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
    //            case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
    //        }
    //        //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
    //    }
    //    return;
    //}
#ifdef __AVX512F__
    constexpr int k_nx = 5;
#else
    constexpr int k_nx = nrc_y == 1 ? 4 : 2;
#endif
    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
        mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    }
    int last_x = k_nx*(nrc_x/k_nx);
    if (last_x == nrc_x) return;
    int nx = nrc_x - last_x;
#ifdef __AVX512F__
    switch (nx) {
        case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
        case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        case 4: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 4>>(n, cx, bx, last_x, info); break;
    }
#else
    if constexpr (nrc_y == 1) {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
            case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
            case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        }
    } else {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        }
    }
#endif
}
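// Why the remainder switches differ: k_nx is the x-row tile size. On AVX512 k_nx == 5,
// so the leftover nx can be 1..4; on AVX2 it is 4 when nrc_y == 1 and 2 otherwise, so
// the leftover is at most 3 in the first case and exactly 1 in the second.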
template <typename FloatX, typename FloatY>
void set_mul_mat_f(MulMat& mm) {
    for (auto& f : mm.funcs) f = nullptr;
    mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>;
    mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>;
    mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>;
    mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>;
    mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>;
#ifndef __AVX512F__
    mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>;
#endif
}
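/*
Hypothetical usage (names illustrative): registering the float kernels for f16
weights against f32 activations. Note funcs[6..7] stay null, as does funcs[5]
under __AVX512F__, so callers must tile the right-hand side accordingly.
*/
MulMat mm_f16_f32;
set_mul_mat_f<ggml_half, float>(mm_f16_f32);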
/*
moonll
pass typeB as well, so we can return false when B is not the expected quantization type
add IQ2_XXS
add IQ1_S
add GGML_TYPE_IQ4_XS
*/
bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
    (void)Ny;
    auto expected_typeB = GGML_TYPE_Q8_K;
    switch (typeA) {
        case GGML_TYPE_Q2_K:
            assert(ne00 % QK_K == 0);
...
@@ -1440,37 +3181,75 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int)
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ4XS>(mm);
            break;
        case GGML_TYPE_IQ2_XXS:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ2XXS>(mm);
            break;
        case GGML_TYPE_Q4_0:
            assert(ne00 % QK4_0 == 0);
            MulMat::set_functions<Q4_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q4_1:
            assert(ne00 % QK4_1 == 0);
            MulMat::set_functions<Q4_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q5_0:
            assert(ne00 % QK5_0 == 0);
            MulMat::set_functions<Q5_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q5_1:
            assert(ne00 % QK5_1 == 0);
            MulMat::set_functions<Q5_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q8_0:
            assert(ne00 % QK8_0 == 0);
#ifdef HAVE_FANCY_SIMD
            MulMat::set_functions<Q8_0_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
#else
            MulMat::set_functions<Q8_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
            break;
        case GGML_TYPE_IQ1_S:
            mm.funcs[0] = mul_mat_iq1_s_q8_K<1>;
            mm.funcs[1] = mul_mat_iq1_s_q8_K<2>;
            mm.funcs[2] = mul_mat_iq1_s_q8_K<3>;
            mm.funcs[3] = mul_mat_iq1_s_q8_K<4>;
            mm.funcs[4] = mul_mat_iq1_s_q8_K<5>;
            mm.funcs[5] = mul_mat_iq1_s_q8_K<6>;
            mm.funcs[6] = mul_mat_iq1_s_q8_K<7>;
            mm.funcs[7] = mul_mat_iq1_s_q8_K<8>;
#ifdef HAVE_FANCY_SIMD
            mm.func16 = mul_mat_iq1_s_q8_K<16>;
#endif
            //row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
            expected_typeB = GGML_TYPE_Q8_K;
            break;
        default:
            {
                printf("case:%d", typeA);
                return false;
            }
    }
    return ggml_type(typeB) == expected_typeB;
}
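/*
Hypothetical call (illustrative): with the new signature the caller also passes the
type of B, and a false return can now mean "B is not quantized as this A type
expects" as well as "unsupported A type".
*/
MulMat mm;
if (!MulMat::set_mul_mat(GGML_TYPE_IQ1_S, GGML_TYPE_Q8_K, ne00, mm, Ny))
    return false;  // for IQ1_S the expected B type is GGML_TYPE_Q8_K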
} // namespace

/*
iq1_s is not supported on arm
*/
#else // __aarch64__

namespace {

...
third_party/llamafile/sgemm.h
View file @ 216a63b8
...
@@ -12,10 +12,15 @@ extern "C" {
struct ggml_tensor;
struct ggml_compute_params;

/*
moonll
extend the old signatures with more params (the A stride, typeB, and the B stride)
*/
bool iqk_mul_mat(long, long, long, int, const void*, long, int, const void*, long, float*, long, int, int);
bool iqk_mul_mat_zen4(long, long, long, int, const void*, long, int, const void*, long, float*, long, int, int);
bool iqk_mul_mat_arm82(long, long, long, int, const void*, long, int, const void*, long, float*, long, int, int);
bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
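/*
Parameter order of the widened iqk_mul_mat, as used by the call site in
tinyblas_cpu_sgemm.inc below (names are illustrative; the header omits them):
iqk_mul_mat(m, n, k_elements, typeA, A, strideA, typeB, B, strideB, C, ldc, ith, nth)
*/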
...
third_party/llamafile/tinyblas_cpu_sgemm.inc
View file @ 216a63b8
...
@@ -323,20 +323,17 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
    /*
    moonll
    accept more Btypes here; iqk_mul_mat now receives Btype and validates it
    */
    if (Ctype == GGML_TYPE_F32) {
        if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A, lda, Btype, B, ldb, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
...
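With this change the x86 path forwards whatever Btype it is given to iqk_mul_mat,
whose MulMat::set_mul_mat returns false on a type mismatch, instead of gating on
Btype == GGML_TYPE_Q8_K at the call site.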