Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
ox696c
ktransformers
Commits
216a63b8
Unverified
Commit
216a63b8
authored
Mar 03, 2025
by
Azure
Committed by
GitHub
Mar 03, 2025
Browse files
Merge pull request #754 from moonshadow-25/dev023
Support for IQ1_S(Dynamic 1.58-bit)
parents
798e1d0c
d24d3693
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
1870 additions
and
89 deletions
+1870
-89
third_party/llamafile/iqk_mul_mat.inc
third_party/llamafile/iqk_mul_mat.inc
+1854
-75
third_party/llamafile/sgemm.h
third_party/llamafile/sgemm.h
+8
-3
third_party/llamafile/tinyblas_cpu_sgemm.inc
third_party/llamafile/tinyblas_cpu_sgemm.inc
+8
-11
No files found.
third_party/llamafile/iqk_mul_mat.inc
View file @
216a63b8
...
@@ -69,6 +69,10 @@
...
@@ -69,6 +69,10 @@
#endif
#endif
constexpr
ggml_type
GGML_TYPE_Q8_0_X4
=
static_cast
<
ggml_type
>
(
98
);
constexpr
ggml_type
GGML_TYPE_Q8_1_X4
=
static_cast
<
ggml_type
>
(
99
);
namespace
{
namespace
{
typedef
struct
{
typedef
struct
{
...
@@ -106,13 +110,36 @@ struct DataInfo {
...
@@ -106,13 +110,36 @@ struct DataInfo {
}
}
};
};
/*
moonll
change param for set_mul_mat
add func16
*/
typedef
void
(
*
mul_mat_t
)(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
);
typedef
void
(
*
mul_mat_t
)(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
);
struct
MulMat
{
struct
MulMat
{
std
::
array
<
mul_mat_t
,
8
>
funcs
=
{};
std
::
array
<
mul_mat_t
,
8
>
funcs
=
{};
mul_mat_t
func16
=
nullptr
;
//inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
//inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
IQK_NOINLINE
void
mul_mat_NxM
(
int
n
,
const
void
*
vx
,
size_t
bx
,
DataInfo
&
info
,
int
nrc_x
,
int
nrc_y
)
{
IQK_NOINLINE
void
mul_mat_NxM
(
int
n
,
const
void
*
vx
,
size_t
bx
,
DataInfo
&
info
,
int
nrc_x
,
int
nrc_y
)
{
constexpr
int
k_x_step
=
64
;
// This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)
constexpr
int
k_x_step
=
64
;
// This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)
if
(
func16
&&
nrc_y
>=
16
)
{
int
n_step
=
(
nrc_y
-
info
.
cur_y
)
/
16
;
for
(
int
ix
=
0
;
ix
<
nrc_x
;
ix
+=
k_x_step
)
{
auto
this_info
=
info
;
this_info
.
s
+=
ix
;
int
this_nrc_x
=
ix
+
k_x_step
<=
nrc_x
?
k_x_step
:
nrc_x
-
ix
;
for
(
int
iy
=
0
;
iy
<
n_step
;
++
iy
)
{
func16
(
n
,
(
const
void
*
)((
const
char
*
)
vx
+
ix
*
bx
),
bx
,
this_info
,
this_nrc_x
);
this_info
.
cur_y
+=
16
;
}
}
info
.
cur_y
+=
16
*
n_step
;
if
(
info
.
cur_y
==
nrc_y
)
return
;
}
int
n_step
=
(
nrc_y
-
info
.
cur_y
)
/
funcs
.
size
();
int
n_step
=
(
nrc_y
-
info
.
cur_y
)
/
funcs
.
size
();
if
(
n_step
>
0
)
{
if
(
n_step
>
0
)
{
for
(
int
ix
=
0
;
ix
<
nrc_x
;
ix
+=
k_x_step
)
{
for
(
int
ix
=
0
;
ix
<
nrc_x
;
ix
+=
k_x_step
)
{
...
@@ -131,7 +158,7 @@ struct MulMat {
...
@@ -131,7 +158,7 @@ struct MulMat {
funcs
[
n_left
-
1
](
n
,
vx
,
bx
,
info
,
nrc_x
);
funcs
[
n_left
-
1
](
n
,
vx
,
bx
,
info
,
nrc_x
);
}
}
}
}
static
IQK_NOINLINE
bool
set_mul_mat
(
int
typeA
,
int
ne00
,
MulMat
&
mm
,
int
&
row_size_q8
,
int
Ny
);
static
IQK_NOINLINE
bool
set_mul_mat
(
int
typeA
,
int
typeB
,
int
ne00
,
MulMat
&
mm
,
int
Ny
);
private
:
private
:
template
<
typename
Dequantizer
>
static
IQK_NOINLINE
void
set_functions
(
MulMat
&
m
);
template
<
typename
Dequantizer
>
static
IQK_NOINLINE
void
set_functions
(
MulMat
&
m
);
};
};
...
@@ -147,6 +174,787 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
...
@@ -147,6 +174,787 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
aux32
[
0
]
=
a0
&
0x3f3f3f3f
;
aux32
[
0
]
=
a0
&
0x3f3f3f3f
;
}
}
/*
moonll
decoding tables
*/
#ifdef __AVX2__
static
const
uint64_t
iq1s_grid_us
[
2048
]
=
{
0x0000000000000000
,
0x0000000000000002
,
0x0000000000000101
,
0x0000000000000200
,
0x0000000000000202
,
0x0000000000010001
,
0x0000000000010101
,
0x0000000000020000
,
0x0000000000020002
,
0x0000000000020200
,
0x0000000000020202
,
0x0000000001000101
,
0x0000000001010001
,
0x0000000001010100
,
0x0000000001010102
,
0x0000000001020101
,
0x0000000002000000
,
0x0000000002000002
,
0x0000000002000200
,
0x0000000002000202
,
0x0000000002010101
,
0x0000000002020000
,
0x0000000002020002
,
0x0000000002020200
,
0x0000000002020202
,
0x0000000100000100
,
0x0000000100000101
,
0x0000000100010001
,
0x0000000100010100
,
0x0000000100010102
,
0x0000000100010201
,
0x0000000100010202
,
0x0000000100020101
,
0x0000000101000001
,
0x0000000101000102
,
0x0000000101000201
,
0x0000000101010002
,
0x0000000101010101
,
0x0000000101010202
,
0x0000000101020001
,
0x0000000101020100
,
0x0000000101020102
,
0x0000000101020200
,
0x0000000102000101
,
0x0000000102010001
,
0x0000000102010100
,
0x0000000102010102
,
0x0000000102020101
,
0x0000000200000000
,
0x0000000200000002
,
0x0000000200000200
,
0x0000000200000202
,
0x0000000200010101
,
0x0000000200020000
,
0x0000000200020002
,
0x0000000200020200
,
0x0000000200020202
,
0x0000000201000101
,
0x0000000201010001
,
0x0000000201010201
,
0x0000000201020100
,
0x0000000201020201
,
0x0000000202000000
,
0x0000000202000002
,
0x0000000202000200
,
0x0000000202000202
,
0x0000000202010001
,
0x0000000202010101
,
0x0000000202010201
,
0x0000000202020000
,
0x0000000202020002
,
0x0000000202020200
,
0x0000000202020202
,
0x0000010000010001
,
0x0000010000010100
,
0x0000010000010102
,
0x0000010000020101
,
0x0000010001000001
,
0x0000010001000201
,
0x0000010001010101
,
0x0000010001010202
,
0x0000010001020100
,
0x0000010001020101
,
0x0000010002010001
,
0x0000010002010201
,
0x0000010002020101
,
0x0000010100000001
,
0x0000010100000100
,
0x0000010100000101
,
0x0000010100000102
,
0x0000010100010101
,
0x0000010100010200
,
0x0000010100010202
,
0x0000010100020201
,
0x0000010101000000
,
0x0000010101000101
,
0x0000010101000202
,
0x0000010101010000
,
0x0000010101010001
,
0x0000010101010100
,
0x0000010101010101
,
0x0000010101010102
,
0x0000010101010201
,
0x0000010101020000
,
0x0000010101020002
,
0x0000010101020101
,
0x0000010101020200
,
0x0000010101020202
,
0x0000010102000001
,
0x0000010102010001
,
0x0000010102010101
,
0x0000010102010200
,
0x0000010102010202
,
0x0000010102020001
,
0x0000010102020100
,
0x0000010102020101
,
0x0000010102020102
,
0x0000010102020201
,
0x0000010200010100
,
0x0000010200010201
,
0x0000010201000001
,
0x0000010201000100
,
0x0000010201010000
,
0x0000010201010002
,
0x0000010201010101
,
0x0000010201010200
,
0x0000010201020000
,
0x0000010201020001
,
0x0000010201020102
,
0x0000010201020201
,
0x0000010202000101
,
0x0000010202010001
,
0x0000010202010100
,
0x0000010202010201
,
0x0000020000000000
,
0x0000020000000002
,
0x0000020000000200
,
0x0000020000000202
,
0x0000020000010101
,
0x0000020000020000
,
0x0000020000020002
,
0x0000020000020200
,
0x0000020000020202
,
0x0000020001000101
,
0x0000020001010001
,
0x0000020001010102
,
0x0000020001020101
,
0x0000020002000000
,
0x0000020002000002
,
0x0000020002000200
,
0x0000020002000202
,
0x0000020002010101
,
0x0000020002020000
,
0x0000020002020002
,
0x0000020002020200
,
0x0000020002020202
,
0x0000020100000101
,
0x0000020100010001
,
0x0000020100010100
,
0x0000020100010201
,
0x0000020100020100
,
0x0000020100020101
,
0x0000020101000001
,
0x0000020101010000
,
0x0000020101010001
,
0x0000020101010101
,
0x0000020101020001
,
0x0000020101020100
,
0x0000020101020201
,
0x0000020102010001
,
0x0000020102010100
,
0x0000020102010102
,
0x0000020102010201
,
0x0000020102020101
,
0x0000020200000000
,
0x0000020200000002
,
0x0000020200000200
,
0x0000020200000202
,
0x0000020200010101
,
0x0000020200020000
,
0x0000020200020002
,
0x0000020200020200
,
0x0000020200020202
,
0x0000020201000101
,
0x0000020201010001
,
0x0000020201010201
,
0x0000020201020001
,
0x0000020201020101
,
0x0000020202000000
,
0x0000020202000002
,
0x0000020202000101
,
0x0000020202000200
,
0x0000020202000202
,
0x0000020202010101
,
0x0000020202020000
,
0x0000020202020002
,
0x0000020202020200
,
0x0000020202020202
,
0x0001000000010000
,
0x0001000000010001
,
0x0001000000010100
,
0x0001000000010201
,
0x0001000000020100
,
0x0001000000020101
,
0x0001000001000001
,
0x0001000001000100
,
0x0001000001010000
,
0x0001000001010101
,
0x0001000001010200
,
0x0001000001020001
,
0x0001000001020100
,
0x0001000001020101
,
0x0001000001020201
,
0x0001000002010001
,
0x0001000002010100
,
0x0001000002010102
,
0x0001000002020001
,
0x0001000002020101
,
0x0001000100000001
,
0x0001000100000100
,
0x0001000100000102
,
0x0001000100000201
,
0x0001000100010000
,
0x0001000100010002
,
0x0001000100010101
,
0x0001000100010200
,
0x0001000100020001
,
0x0001000100020100
,
0x0001000100020201
,
0x0001000101000101
,
0x0001000101000202
,
0x0001000101010000
,
0x0001000101010001
,
0x0001000101010002
,
0x0001000101010100
,
0x0001000101010101
,
0x0001000101010102
,
0x0001000101010201
,
0x0001000101020000
,
0x0001000101020101
,
0x0001000102000100
,
0x0001000102010002
,
0x0001000102010101
,
0x0001000102020001
,
0x0001000102020100
,
0x0001000200010001
,
0x0001000200010100
,
0x0001000200010102
,
0x0001000200020101
,
0x0001000201000000
,
0x0001000201000102
,
0x0001000201000201
,
0x0001000201010002
,
0x0001000201010101
,
0x0001000201010200
,
0x0001000201010202
,
0x0001000201020100
,
0x0001000201020102
,
0x0001000202000101
,
0x0001000202010001
,
0x0001000202010100
,
0x0001000202010102
,
0x0001000202020101
,
0x0001010000000001
,
0x0001010000000102
,
0x0001010000000201
,
0x0001010000010100
,
0x0001010000010101
,
0x0001010000010200
,
0x0001010000010201
,
0x0001010000020001
,
0x0001010000020102
,
0x0001010001000001
,
0x0001010001000101
,
0x0001010001000102
,
0x0001010001000200
,
0x0001010001000202
,
0x0001010001010001
,
0x0001010001010100
,
0x0001010001010101
,
0x0001010001010102
,
0x0001010001010201
,
0x0001010001020002
,
0x0001010001020101
,
0x0001010001020200
,
0x0001010002000100
,
0x0001010002000201
,
0x0001010002010000
,
0x0001010002010100
,
0x0001010002010101
,
0x0001010002010200
,
0x0001010002010201
,
0x0001010002010202
,
0x0001010002020001
,
0x0001010002020100
,
0x0001010002020101
,
0x0001010002020201
,
0x0001010100000002
,
0x0001010100000101
,
0x0001010100000202
,
0x0001010100010001
,
0x0001010100010100
,
0x0001010100010101
,
0x0001010100010102
,
0x0001010100010201
,
0x0001010100020000
,
0x0001010100020002
,
0x0001010100020101
,
0x0001010100020200
,
0x0001010100020202
,
0x0001010101000001
,
0x0001010101000100
,
0x0001010101000101
,
0x0001010101000102
,
0x0001010101010001
,
0x0001010101010002
,
0x0001010101010100
,
0x0001010101010101
,
0x0001010101010102
,
0x0001010101010201
,
0x0001010101010202
,
0x0001010101020001
,
0x0001010101020100
,
0x0001010101020101
,
0x0001010101020102
,
0x0001010101020201
,
0x0001010102000000
,
0x0001010102000002
,
0x0001010102000100
,
0x0001010102000101
,
0x0001010102000200
,
0x0001010102000202
,
0x0001010102010000
,
0x0001010102010001
,
0x0001010102010100
,
0x0001010102010101
,
0x0001010102010102
,
0x0001010102010201
,
0x0001010102010202
,
0x0001010102020000
,
0x0001010102020002
,
0x0001010102020101
,
0x0001010200000001
,
0x0001010200000100
,
0x0001010200000101
,
0x0001010200000102
,
0x0001010200010101
,
0x0001010200010102
,
0x0001010200010200
,
0x0001010200010202
,
0x0001010200020001
,
0x0001010200020102
,
0x0001010201000000
,
0x0001010201000002
,
0x0001010201000100
,
0x0001010201000101
,
0x0001010201000200
,
0x0001010201000202
,
0x0001010201010001
,
0x0001010201010101
,
0x0001010201010102
,
0x0001010201010200
,
0x0001010201010201
,
0x0001010201020001
,
0x0001010201020100
,
0x0001010201020101
,
0x0001010201020200
,
0x0001010201020201
,
0x0001010201020202
,
0x0001010202000102
,
0x0001010202000202
,
0x0001010202010002
,
0x0001010202010101
,
0x0001010202020100
,
0x0001010202020201
,
0x0001020000010001
,
0x0001020000010102
,
0x0001020000020101
,
0x0001020001000001
,
0x0001020001000100
,
0x0001020001000102
,
0x0001020001000201
,
0x0001020001010000
,
0x0001020001010101
,
0x0001020001010200
,
0x0001020001010202
,
0x0001020001020000
,
0x0001020001020001
,
0x0001020001020100
,
0x0001020001020102
,
0x0001020001020201
,
0x0001020002000101
,
0x0001020002010001
,
0x0001020002010100
,
0x0001020002020101
,
0x0001020100010000
,
0x0001020100010002
,
0x0001020100010101
,
0x0001020100010202
,
0x0001020100020001
,
0x0001020100020101
,
0x0001020101000002
,
0x0001020101000100
,
0x0001020101000101
,
0x0001020101000200
,
0x0001020101010001
,
0x0001020101010100
,
0x0001020101010101
,
0x0001020101010102
,
0x0001020101010201
,
0x0001020101010202
,
0x0001020101020000
,
0x0001020101020101
,
0x0001020101020202
,
0x0001020102000201
,
0x0001020102010001
,
0x0001020102010002
,
0x0001020102010101
,
0x0001020102010200
,
0x0001020102020001
,
0x0001020102020102
,
0x0001020102020201
,
0x0001020200000201
,
0x0001020200010102
,
0x0001020200020100
,
0x0001020200020102
,
0x0001020201000100
,
0x0001020201000102
,
0x0001020201000201
,
0x0001020201010000
,
0x0001020201010002
,
0x0001020201010101
,
0x0001020201010200
,
0x0001020201020001
,
0x0001020201020102
,
0x0001020201020201
,
0x0001020202000101
,
0x0001020202010001
,
0x0001020202010102
,
0x0001020202010202
,
0x0002000000000000
,
0x0002000000000002
,
0x0002000000000200
,
0x0002000000000202
,
0x0002000000010101
,
0x0002000000020000
,
0x0002000000020002
,
0x0002000000020101
,
0x0002000000020200
,
0x0002000000020202
,
0x0002000001000101
,
0x0002000001010001
,
0x0002000001010201
,
0x0002000001020001
,
0x0002000001020101
,
0x0002000002000000
,
0x0002000002000002
,
0x0002000002000200
,
0x0002000002000202
,
0x0002000002010101
,
0x0002000002020000
,
0x0002000002020002
,
0x0002000002020101
,
0x0002000002020200
,
0x0002000002020202
,
0x0002000100000101
,
0x0002000100010001
,
0x0002000100010100
,
0x0002000100010201
,
0x0002000100020101
,
0x0002000101000002
,
0x0002000101000100
,
0x0002000101000201
,
0x0002000101010101
,
0x0002000101010200
,
0x0002000101010202
,
0x0002000101020001
,
0x0002000101020100
,
0x0002000101020101
,
0x0002000101020102
,
0x0002000102000101
,
0x0002000102010000
,
0x0002000102010102
,
0x0002000102010201
,
0x0002000102020101
,
0x0002000200000001
,
0x0002000200000200
,
0x0002000200000202
,
0x0002000200010001
,
0x0002000200010101
,
0x0002000200020000
,
0x0002000200020002
,
0x0002000200020200
,
0x0002000200020202
,
0x0002000201000101
,
0x0002000201010001
,
0x0002000201010102
,
0x0002000201010201
,
0x0002000201020101
,
0x0002000202000001
,
0x0002000202000200
,
0x0002000202000202
,
0x0002000202010001
,
0x0002000202010101
,
0x0002000202020000
,
0x0002000202020002
,
0x0002000202020200
,
0x0002000202020202
,
0x0002010000000101
,
0x0002010000010100
,
0x0002010000010102
,
0x0002010000010201
,
0x0002010000020101
,
0x0002010001000100
,
0x0002010001000101
,
0x0002010001000102
,
0x0002010001000201
,
0x0002010001010002
,
0x0002010001010101
,
0x0002010001010200
,
0x0002010001010202
,
0x0002010001020102
,
0x0002010002000101
,
0x0002010002010001
,
0x0002010002010100
,
0x0002010002010201
,
0x0002010002020001
,
0x0002010002020101
,
0x0002010100000201
,
0x0002010100010101
,
0x0002010100020001
,
0x0002010100020201
,
0x0002010101000000
,
0x0002010101000101
,
0x0002010101000200
,
0x0002010101010001
,
0x0002010101010100
,
0x0002010101010101
,
0x0002010101010201
,
0x0002010101020002
,
0x0002010101020101
,
0x0002010101020200
,
0x0002010102000201
,
0x0002010102010000
,
0x0002010102010100
,
0x0002010102010101
,
0x0002010102010200
,
0x0002010102010202
,
0x0002010102020001
,
0x0002010102020100
,
0x0002010102020102
,
0x0002010102020201
,
0x0002010200000101
,
0x0002010200010000
,
0x0002010200010002
,
0x0002010200010201
,
0x0002010200020101
,
0x0002010201000001
,
0x0002010201000201
,
0x0002010201010101
,
0x0002010201020000
,
0x0002010201020001
,
0x0002010201020201
,
0x0002010202000100
,
0x0002010202000102
,
0x0002010202010000
,
0x0002010202010202
,
0x0002020000000000
,
0x0002020000000002
,
0x0002020000000200
,
0x0002020000000202
,
0x0002020000010101
,
0x0002020000020000
,
0x0002020000020002
,
0x0002020000020200
,
0x0002020000020202
,
0x0002020001000101
,
0x0002020001010001
,
0x0002020001010100
,
0x0002020001020101
,
0x0002020002000000
,
0x0002020002000002
,
0x0002020002000200
,
0x0002020002000202
,
0x0002020002020000
,
0x0002020002020002
,
0x0002020002020200
,
0x0002020002020202
,
0x0002020100000201
,
0x0002020100010001
,
0x0002020100010100
,
0x0002020100010201
,
0x0002020100020101
,
0x0002020101000102
,
0x0002020101000201
,
0x0002020101010002
,
0x0002020101010101
,
0x0002020101020001
,
0x0002020101020100
,
0x0002020101020102
,
0x0002020101020201
,
0x0002020102000101
,
0x0002020102010000
,
0x0002020102010102
,
0x0002020102010201
,
0x0002020102020100
,
0x0002020102020101
,
0x0002020200000000
,
0x0002020200000002
,
0x0002020200000200
,
0x0002020200000202
,
0x0002020200020000
,
0x0002020200020002
,
0x0002020200020200
,
0x0002020200020202
,
0x0002020201000101
,
0x0002020201010001
,
0x0002020201010102
,
0x0002020201010201
,
0x0002020201020101
,
0x0002020202000000
,
0x0002020202000002
,
0x0002020202000200
,
0x0002020202000202
,
0x0002020202010101
,
0x0002020202020000
,
0x0002020202020002
,
0x0002020202020200
,
0x0002020202020202
,
0x0100000000000101
,
0x0100000000010001
,
0x0100000000010102
,
0x0100000000020101
,
0x0100000001000201
,
0x0100000001010002
,
0x0100000001010101
,
0x0100000001010200
,
0x0100000001010202
,
0x0100000001020001
,
0x0100000001020100
,
0x0100000001020102
,
0x0100000002010100
,
0x0100000002010201
,
0x0100000002020001
,
0x0100000002020102
,
0x0100000100000000
,
0x0100000100000001
,
0x0100000100000100
,
0x0100000100000102
,
0x0100000100000201
,
0x0100000100010002
,
0x0100000100010101
,
0x0100000100010102
,
0x0100000100010200
,
0x0100000100010202
,
0x0100000100020001
,
0x0100000100020102
,
0x0100000100020201
,
0x0100000101000101
,
0x0100000101000200
,
0x0100000101000202
,
0x0100000101010001
,
0x0100000101010100
,
0x0100000101010101
,
0x0100000101010102
,
0x0100000101010201
,
0x0100000101010202
,
0x0100000101020101
,
0x0100000101020200
,
0x0100000101020202
,
0x0100000102000001
,
0x0100000102000100
,
0x0100000102000102
,
0x0100000102010000
,
0x0100000102010002
,
0x0100000102010101
,
0x0100000102020000
,
0x0100000102020001
,
0x0100000102020002
,
0x0100000200000101
,
0x0100000200010001
,
0x0100000200010100
,
0x0100000200010102
,
0x0100000200020101
,
0x0100000201000001
,
0x0100000201010002
,
0x0100000201010101
,
0x0100000201010202
,
0x0100000201020100
,
0x0100000201020201
,
0x0100000202000201
,
0x0100000202010100
,
0x0100000202020101
,
0x0100010000000001
,
0x0100010000010101
,
0x0100010000010201
,
0x0100010000020201
,
0x0100010001000101
,
0x0100010001000200
,
0x0100010001000202
,
0x0100010001010001
,
0x0100010001010100
,
0x0100010001010101
,
0x0100010001010102
,
0x0100010001020001
,
0x0100010001020002
,
0x0100010001020101
,
0x0100010001020200
,
0x0100010001020202
,
0x0100010002000001
,
0x0100010002000102
,
0x0100010002000201
,
0x0100010002010000
,
0x0100010002010002
,
0x0100010002010101
,
0x0100010002020000
,
0x0100010002020001
,
0x0100010002020201
,
0x0100010100000001
,
0x0100010100000002
,
0x0100010100000101
,
0x0100010100000202
,
0x0100010100010001
,
0x0100010100010100
,
0x0100010100010101
,
0x0100010100010102
,
0x0100010100010201
,
0x0100010100020000
,
0x0100010100020101
,
0x0100010100020202
,
0x0100010101000001
,
0x0100010101000100
,
0x0100010101000101
,
0x0100010101000102
,
0x0100010101000201
,
0x0100010101010000
,
0x0100010101010001
,
0x0100010101010100
,
0x0100010101010101
,
0x0100010101010102
,
0x0100010101010200
,
0x0100010101010201
,
0x0100010101020001
,
0x0100010101020100
,
0x0100010101020101
,
0x0100010101020102
,
0x0100010101020201
,
0x0100010102000002
,
0x0100010102000100
,
0x0100010102000101
,
0x0100010102000200
,
0x0100010102010001
,
0x0100010102010100
,
0x0100010102010101
,
0x0100010102010102
,
0x0100010102010201
,
0x0100010102010202
,
0x0100010102020101
,
0x0100010102020200
,
0x0100010102020202
,
0x0100010200000001
,
0x0100010200000101
,
0x0100010200000201
,
0x0100010200010100
,
0x0100010200010101
,
0x0100010200010200
,
0x0100010200010202
,
0x0100010200020001
,
0x0100010200020100
,
0x0100010200020201
,
0x0100010201000000
,
0x0100010201000002
,
0x0100010201000101
,
0x0100010201000200
,
0x0100010201010000
,
0x0100010201010001
,
0x0100010201010002
,
0x0100010201010101
,
0x0100010201010102
,
0x0100010201010201
,
0x0100010201020002
,
0x0100010201020101
,
0x0100010201020200
,
0x0100010202000001
,
0x0100010202000101
,
0x0100010202000202
,
0x0100010202010100
,
0x0100010202010101
,
0x0100010202020001
,
0x0100010202020100
,
0x0100010202020102
,
0x0100020000000101
,
0x0100020000010001
,
0x0100020000010101
,
0x0100020000010202
,
0x0100020000020101
,
0x0100020001000002
,
0x0100020001000201
,
0x0100020001010000
,
0x0100020001010101
,
0x0100020001010200
,
0x0100020001020001
,
0x0100020001020100
,
0x0100020001020102
,
0x0100020001020201
,
0x0100020002000101
,
0x0100020002010001
,
0x0100020002010100
,
0x0100020002010102
,
0x0100020002010201
,
0x0100020002020101
,
0x0100020100000001
,
0x0100020100000101
,
0x0100020100000102
,
0x0100020100000202
,
0x0100020100010000
,
0x0100020100010100
,
0x0100020100010101
,
0x0100020100010200
,
0x0100020100020001
,
0x0100020100020100
,
0x0100020100020102
,
0x0100020101000000
,
0x0100020101000101
,
0x0100020101000202
,
0x0100020101010001
,
0x0100020101010002
,
0x0100020101010100
,
0x0100020101010101
,
0x0100020101010102
,
0x0100020101010201
,
0x0100020101020000
,
0x0100020101020002
,
0x0100020101020101
,
0x0100020101020102
,
0x0100020101020202
,
0x0100020102000102
,
0x0100020102000201
,
0x0100020102010002
,
0x0100020102010101
,
0x0100020102010102
,
0x0100020102010200
,
0x0100020102020001
,
0x0100020102020100
,
0x0100020102020102
,
0x0100020102020201
,
0x0100020200010102
,
0x0100020201000100
,
0x0100020201000102
,
0x0100020201000201
,
0x0100020201010101
,
0x0100020201010200
,
0x0100020201010202
,
0x0100020201020100
,
0x0100020201020201
,
0x0100020202010100
,
0x0100020202020101
,
0x0101000000000001
,
0x0101000000000100
,
0x0101000000000101
,
0x0101000000000102
,
0x0101000000000201
,
0x0101000000010002
,
0x0101000000010101
,
0x0101000000010202
,
0x0101000000020001
,
0x0101000000020100
,
0x0101000000020201
,
0x0101000001000000
,
0x0101000001000101
,
0x0101000001000200
,
0x0101000001010001
,
0x0101000001010100
,
0x0101000001010101
,
0x0101000001010102
,
0x0101000001010201
,
0x0101000001020101
,
0x0101000001020200
,
0x0101000002000102
,
0x0101000002000201
,
0x0101000002010101
,
0x0101000002010200
,
0x0101000002020000
,
0x0101000002020001
,
0x0101000002020102
,
0x0101000002020201
,
0x0101000100000101
,
0x0101000100000200
,
0x0101000100000201
,
0x0101000100000202
,
0x0101000100010001
,
0x0101000100010100
,
0x0101000100010101
,
0x0101000100010102
,
0x0101000100010200
,
0x0101000100010201
,
0x0101000100020000
,
0x0101000100020101
,
0x0101000100020102
,
0x0101000100020200
,
0x0101000100020202
,
0x0101000101000001
,
0x0101000101000100
,
0x0101000101000101
,
0x0101000101000102
,
0x0101000101000201
,
0x0101000101010000
,
0x0101000101010001
,
0x0101000101010002
,
0x0101000101010100
,
0x0101000101010101
,
0x0101000101010102
,
0x0101000101010200
,
0x0101000101010201
,
0x0101000101010202
,
0x0101000101020001
,
0x0101000101020100
,
0x0101000101020101
,
0x0101000101020102
,
0x0101000101020201
,
0x0101000102000002
,
0x0101000102000101
,
0x0101000102010001
,
0x0101000102010100
,
0x0101000102010101
,
0x0101000102010102
,
0x0101000102010201
,
0x0101000102020000
,
0x0101000102020101
,
0x0101000102020202
,
0x0101000200000001
,
0x0101000200000102
,
0x0101000200010002
,
0x0101000200010101
,
0x0101000200010202
,
0x0101000200020001
,
0x0101000200020100
,
0x0101000201000002
,
0x0101000201000101
,
0x0101000201000202
,
0x0101000201010001
,
0x0101000201010100
,
0x0101000201010101
,
0x0101000201010102
,
0x0101000201010201
,
0x0101000201020002
,
0x0101000201020101
,
0x0101000202000101
,
0x0101000202010000
,
0x0101000202010002
,
0x0101000202010101
,
0x0101000202010201
,
0x0101000202010202
,
0x0101000202020100
,
0x0101010000000100
,
0x0101010000000101
,
0x0101010000010001
,
0x0101010000010100
,
0x0101010000010101
,
0x0101010000010102
,
0x0101010000010200
,
0x0101010000010201
,
0x0101010000020001
,
0x0101010000020101
,
0x0101010000020200
,
0x0101010000020202
,
0x0101010001000001
,
0x0101010001000100
,
0x0101010001000101
,
0x0101010001000102
,
0x0101010001000201
,
0x0101010001000202
,
0x0101010001010000
,
0x0101010001010001
,
0x0101010001010100
,
0x0101010001010101
,
0x0101010001010102
,
0x0101010001010200
,
0x0101010001010201
,
0x0101010001010202
,
0x0101010001020001
,
0x0101010001020002
,
0x0101010001020100
,
0x0101010001020101
,
0x0101010001020102
,
0x0101010001020201
,
0x0101010002000000
,
0x0101010002000200
,
0x0101010002000202
,
0x0101010002010001
,
0x0101010002010100
,
0x0101010002010101
,
0x0101010002010102
,
0x0101010002010201
,
0x0101010002020001
,
0x0101010002020100
,
0x0101010002020101
,
0x0101010002020202
,
0x0101010100000001
,
0x0101010100000002
,
0x0101010100000100
,
0x0101010100000101
,
0x0101010100000102
,
0x0101010100000201
,
0x0101010100010000
,
0x0101010100010001
,
0x0101010100010002
,
0x0101010100010100
,
0x0101010100010101
,
0x0101010100010102
,
0x0101010100010201
,
0x0101010100010202
,
0x0101010100020001
,
0x0101010100020100
,
0x0101010100020101
,
0x0101010100020102
,
0x0101010100020201
,
0x0101010101000000
,
0x0101010101000001
,
0x0101010101000002
,
0x0101010101000100
,
0x0101010101000101
,
0x0101010101000102
,
0x0101010101000200
,
0x0101010101000201
,
0x0101010101010000
,
0x0101010101010001
,
0x0101010101010002
,
0x0101010101010100
,
0x0101010101010101
,
0x0101010101010102
,
0x0101010101010200
,
0x0101010101010201
,
0x0101010101010202
,
0x0101010101020000
,
0x0101010101020001
,
0x0101010101020100
,
0x0101010101020101
,
0x0101010101020102
,
0x0101010101020200
,
0x0101010101020201
,
0x0101010101020202
,
0x0101010102000001
,
0x0101010102000100
,
0x0101010102000101
,
0x0101010102000201
,
0x0101010102000202
,
0x0101010102010000
,
0x0101010102010001
,
0x0101010102010100
,
0x0101010102010101
,
0x0101010102010102
,
0x0101010102010200
,
0x0101010102010201
,
0x0101010102020001
,
0x0101010102020100
,
0x0101010102020101
,
0x0101010102020102
,
0x0101010102020201
,
0x0101010200000000
,
0x0101010200000001
,
0x0101010200000002
,
0x0101010200000100
,
0x0101010200000102
,
0x0101010200000200
,
0x0101010200000201
,
0x0101010200010001
,
0x0101010200010100
,
0x0101010200010101
,
0x0101010200010200
,
0x0101010200010201
,
0x0101010200020000
,
0x0101010200020001
,
0x0101010200020002
,
0x0101010200020100
,
0x0101010200020101
,
0x0101010200020102
,
0x0101010200020200
,
0x0101010200020201
,
0x0101010201000001
,
0x0101010201000101
,
0x0101010201000102
,
0x0101010201000200
,
0x0101010201000201
,
0x0101010201000202
,
0x0101010201010000
,
0x0101010201010001
,
0x0101010201010002
,
0x0101010201010100
,
0x0101010201010101
,
0x0101010201010102
,
0x0101010201010200
,
0x0101010201010201
,
0x0101010201010202
,
0x0101010201020001
,
0x0101010201020100
,
0x0101010201020101
,
0x0101010201020201
,
0x0101010202000002
,
0x0101010202000101
,
0x0101010202000102
,
0x0101010202000200
,
0x0101010202000201
,
0x0101010202000202
,
0x0101010202010001
,
0x0101010202010101
,
0x0101010202010202
,
0x0101010202020002
,
0x0101010202020101
,
0x0101010202020102
,
0x0101010202020200
,
0x0101010202020201
,
0x0101020000000100
,
0x0101020000000101
,
0x0101020000000102
,
0x0101020000000201
,
0x0101020000010000
,
0x0101020000010101
,
0x0101020000010200
,
0x0101020000020001
,
0x0101020000020202
,
0x0101020001000101
,
0x0101020001000200
,
0x0101020001000202
,
0x0101020001010001
,
0x0101020001010100
,
0x0101020001010101
,
0x0101020001010102
,
0x0101020001010200
,
0x0101020001010201
,
0x0101020001020000
,
0x0101020001020002
,
0x0101020001020100
,
0x0101020001020101
,
0x0101020002000002
,
0x0101020002000201
,
0x0101020002010000
,
0x0101020002010002
,
0x0101020002010101
,
0x0101020002010200
,
0x0101020002020001
,
0x0101020002020201
,
0x0101020100000001
,
0x0101020100000002
,
0x0101020100000101
,
0x0101020100000202
,
0x0101020100010001
,
0x0101020100010100
,
0x0101020100010101
,
0x0101020100010102
,
0x0101020100010201
,
0x0101020100020101
,
0x0101020101000001
,
0x0101020101000100
,
0x0101020101000101
,
0x0101020101000102
,
0x0101020101000201
,
0x0101020101010000
,
0x0101020101010001
,
0x0101020101010002
,
0x0101020101010100
,
0x0101020101010101
,
0x0101020101010102
,
0x0101020101010200
,
0x0101020101010201
,
0x0101020101010202
,
0x0101020101020001
,
0x0101020101020100
,
0x0101020101020101
,
0x0101020101020102
,
0x0101020101020201
,
0x0101020102000001
,
0x0101020102000101
,
0x0101020102000201
,
0x0101020102010001
,
0x0101020102010100
,
0x0101020102010101
,
0x0101020102010102
,
0x0101020102010200
,
0x0101020102010201
,
0x0101020102020101
,
0x0101020200000100
,
0x0101020200000200
,
0x0101020200010101
,
0x0101020200010202
,
0x0101020200020000
,
0x0101020200020101
,
0x0101020200020102
,
0x0101020200020201
,
0x0101020201000101
,
0x0101020201000200
,
0x0101020201000201
,
0x0101020201010001
,
0x0101020201010101
,
0x0101020201010102
,
0x0101020201010200
,
0x0101020201010201
,
0x0101020201020002
,
0x0101020201020101
,
0x0101020201020200
,
0x0101020201020202
,
0x0101020202000001
,
0x0101020202000202
,
0x0101020202010002
,
0x0101020202010101
,
0x0101020202010102
,
0x0101020202010200
,
0x0101020202010202
,
0x0101020202020001
,
0x0102000000000101
,
0x0102000000010100
,
0x0102000000010102
,
0x0102000000010201
,
0x0102000000020101
,
0x0102000001000100
,
0x0102000001010000
,
0x0102000001010101
,
0x0102000001010102
,
0x0102000001010200
,
0x0102000001010202
,
0x0102000001020001
,
0x0102000001020100
,
0x0102000001020102
,
0x0102000001020201
,
0x0102000002000001
,
0x0102000002010102
,
0x0102000002020101
,
0x0102000100000001
,
0x0102000100000100
,
0x0102000100000102
,
0x0102000100000201
,
0x0102000100010002
,
0x0102000100010101
,
0x0102000100020001
,
0x0102000100020002
,
0x0102000100020102
,
0x0102000100020201
,
0x0102000101000101
,
0x0102000101000201
,
0x0102000101010001
,
0x0102000101010101
,
0x0102000101010102
,
0x0102000101010201
,
0x0102000101020101
,
0x0102000101020102
,
0x0102000101020202
,
0x0102000102000100
,
0x0102000102000202
,
0x0102000102010002
,
0x0102000102010101
,
0x0102000102020001
,
0x0102000102020102
,
0x0102000102020201
,
0x0102000200010001
,
0x0102000200010102
,
0x0102000200010201
,
0x0102000201000000
,
0x0102000201000001
,
0x0102000201000102
,
0x0102000201010101
,
0x0102000201010102
,
0x0102000201010200
,
0x0102000201020000
,
0x0102000202000101
,
0x0102000202010001
,
0x0102000202010102
,
0x0102000202020101
,
0x0102010000010001
,
0x0102010000010002
,
0x0102010000010101
,
0x0102010000010102
,
0x0102010000010202
,
0x0102010000020001
,
0x0102010000020102
,
0x0102010000020201
,
0x0102010001000000
,
0x0102010001000002
,
0x0102010001000101
,
0x0102010001000200
,
0x0102010001000202
,
0x0102010001010001
,
0x0102010001010100
,
0x0102010001010101
,
0x0102010001010102
,
0x0102010001010201
,
0x0102010001010202
,
0x0102010001020000
,
0x0102010001020002
,
0x0102010001020101
,
0x0102010002000100
,
0x0102010002000101
,
0x0102010002000201
,
0x0102010002010000
,
0x0102010002010002
,
0x0102010002010100
,
0x0102010002010101
,
0x0102010002010102
,
0x0102010002010200
,
0x0102010002010202
,
0x0102010002020001
,
0x0102010002020100
,
0x0102010002020201
,
0x0102010100000101
,
0x0102010100000200
,
0x0102010100000202
,
0x0102010100010001
,
0x0102010100010101
,
0x0102010100010102
,
0x0102010100010201
,
0x0102010101000100
,
0x0102010101000101
,
0x0102010101000102
,
0x0102010101000201
,
0x0102010101010000
,
0x0102010101010001
,
0x0102010101010100
,
0x0102010101010101
,
0x0102010101010102
,
0x0102010101010201
,
0x0102010101020001
,
0x0102010101020100
,
0x0102010101020101
,
0x0102010101020102
,
0x0102010101020201
,
0x0102010102000102
,
0x0102010102000201
,
0x0102010102000202
,
0x0102010102010001
,
0x0102010102010101
,
0x0102010102010102
,
0x0102010102010201
,
0x0102010102010202
,
0x0102010102020002
,
0x0102010102020101
,
0x0102010102020102
,
0x0102010102020200
,
0x0102010200000002
,
0x0102010200000201
,
0x0102010200010101
,
0x0102010200020000
,
0x0102010200020102
,
0x0102010200020200
,
0x0102010200020201
,
0x0102010201000000
,
0x0102010201000101
,
0x0102010201000200
,
0x0102010201000202
,
0x0102010201010001
,
0x0102010201010100
,
0x0102010201010101
,
0x0102010201010102
,
0x0102010201010200
,
0x0102010201010202
,
0x0102010201020000
,
0x0102010201020101
,
0x0102010201020200
,
0x0102010202000000
,
0x0102010202000002
,
0x0102010202000101
,
0x0102010202000202
,
0x0102010202010100
,
0x0102010202010102
,
0x0102010202010200
,
0x0102010202010201
,
0x0102010202020000
,
0x0102010202020100
,
0x0102010202020102
,
0x0102010202020202
,
0x0102020000010102
,
0x0102020000010201
,
0x0102020000020101
,
0x0102020001000001
,
0x0102020001010002
,
0x0102020001010101
,
0x0102020001010202
,
0x0102020001020001
,
0x0102020001020201
,
0x0102020002000101
,
0x0102020002010001
,
0x0102020002010200
,
0x0102020002020102
,
0x0102020100000001
,
0x0102020100000100
,
0x0102020100010000
,
0x0102020100010101
,
0x0102020100020001
,
0x0102020100020100
,
0x0102020100020102
,
0x0102020100020201
,
0x0102020101000000
,
0x0102020101000001
,
0x0102020101000101
,
0x0102020101000102
,
0x0102020101000200
,
0x0102020101010001
,
0x0102020101010100
,
0x0102020101010101
,
0x0102020101010102
,
0x0102020101010201
,
0x0102020101020000
,
0x0102020101020101
,
0x0102020101020202
,
0x0102020102000002
,
0x0102020102000100
,
0x0102020102000202
,
0x0102020102010101
,
0x0102020102020001
,
0x0102020102020100
,
0x0102020102020101
,
0x0102020102020201
,
0x0102020200010001
,
0x0102020200010102
,
0x0102020200010200
,
0x0102020201000001
,
0x0102020201000100
,
0x0102020201000201
,
0x0102020201010000
,
0x0102020201010101
,
0x0102020201010200
,
0x0102020201010202
,
0x0102020201020100
,
0x0102020201020101
,
0x0102020201020201
,
0x0102020202000102
,
0x0102020202010100
,
0x0102020202010200
,
0x0102020202010202
,
0x0102020202020102
,
0x0200000000000000
,
0x0200000000000002
,
0x0200000000000200
,
0x0200000000000202
,
0x0200000000020000
,
0x0200000000020002
,
0x0200000000020200
,
0x0200000000020202
,
0x0200000001000101
,
0x0200000001010000
,
0x0200000001010001
,
0x0200000001010100
,
0x0200000001010102
,
0x0200000001010201
,
0x0200000001020101
,
0x0200000002000000
,
0x0200000002000002
,
0x0200000002000200
,
0x0200000002000202
,
0x0200000002010101
,
0x0200000002020000
,
0x0200000002020002
,
0x0200000002020200
,
0x0200000002020202
,
0x0200000100000101
,
0x0200000100010001
,
0x0200000100010100
,
0x0200000100010102
,
0x0200000100010201
,
0x0200000100020101
,
0x0200000101000001
,
0x0200000101000100
,
0x0200000101000201
,
0x0200000101010000
,
0x0200000101010002
,
0x0200000101010101
,
0x0200000101010102
,
0x0200000101010200
,
0x0200000101010201
,
0x0200000101020100
,
0x0200000101020102
,
0x0200000101020201
,
0x0200000102000101
,
0x0200000102000201
,
0x0200000102010100
,
0x0200000102010102
,
0x0200000102010201
,
0x0200000102020101
,
0x0200000200000000
,
0x0200000200000002
,
0x0200000200000200
,
0x0200000200000202
,
0x0200000200010101
,
0x0200000200020000
,
0x0200000200020002
,
0x0200000200020200
,
0x0200000200020202
,
0x0200000201010001
,
0x0200000201010100
,
0x0200000201010201
,
0x0200000201020101
,
0x0200000202000000
,
0x0200000202000002
,
0x0200000202000200
,
0x0200000202000202
,
0x0200000202010101
,
0x0200000202020000
,
0x0200000202020002
,
0x0200000202020200
,
0x0200000202020202
,
0x0200010000010100
,
0x0200010000010201
,
0x0200010001000001
,
0x0200010001000100
,
0x0200010001010001
,
0x0200010001010101
,
0x0200010001010202
,
0x0200010001020001
,
0x0200010001020100
,
0x0200010001020201
,
0x0200010002010100
,
0x0200010002010201
,
0x0200010100000001
,
0x0200010100000201
,
0x0200010100010002
,
0x0200010100010101
,
0x0200010100010202
,
0x0200010100020102
,
0x0200010100020201
,
0x0200010101000000
,
0x0200010101000001
,
0x0200010101000101
,
0x0200010101000200
,
0x0200010101010001
,
0x0200010101010100
,
0x0200010101010101
,
0x0200010101010102
,
0x0200010101010201
,
0x0200010101010202
,
0x0200010101020101
,
0x0200010101020102
,
0x0200010101020200
,
0x0200010101020202
,
0x0200010102000001
,
0x0200010102000100
,
0x0200010102000102
,
0x0200010102000201
,
0x0200010102010000
,
0x0200010102010002
,
0x0200010102010101
,
0x0200010102010200
,
0x0200010102020102
,
0x0200010200010001
,
0x0200010200010102
,
0x0200010200010201
,
0x0200010200020101
,
0x0200010201000001
,
0x0200010201000100
,
0x0200010201000201
,
0x0200010201000202
,
0x0200010201010000
,
0x0200010201010101
,
0x0200010201010201
,
0x0200010201010202
,
0x0200010201020001
,
0x0200010201020102
,
0x0200010201020202
,
0x0200010202000101
,
0x0200010202010001
,
0x0200010202010202
,
0x0200010202020100
,
0x0200020000000000
,
0x0200020000000002
,
0x0200020000000200
,
0x0200020000000202
,
0x0200020000010101
,
0x0200020000020000
,
0x0200020000020002
,
0x0200020000020200
,
0x0200020000020202
,
0x0200020001000001
,
0x0200020001000101
,
0x0200020001010001
,
0x0200020001010100
,
0x0200020001010201
,
0x0200020001020101
,
0x0200020001020201
,
0x0200020002000000
,
0x0200020002000002
,
0x0200020002000200
,
0x0200020002000202
,
0x0200020002010101
,
0x0200020002020000
,
0x0200020002020002
,
0x0200020002020200
,
0x0200020002020202
,
0x0200020100000101
,
0x0200020100000102
,
0x0200020100010001
,
0x0200020100010100
,
0x0200020100010102
,
0x0200020100020101
,
0x0200020101000001
,
0x0200020101000100
,
0x0200020101000102
,
0x0200020101000201
,
0x0200020101010000
,
0x0200020101010002
,
0x0200020101010101
,
0x0200020101010202
,
0x0200020101020001
,
0x0200020101020100
,
0x0200020102000101
,
0x0200020102010102
,
0x0200020102010201
,
0x0200020102020101
,
0x0200020200000000
,
0x0200020200000002
,
0x0200020200000200
,
0x0200020200000202
,
0x0200020200010101
,
0x0200020200020000
,
0x0200020200020002
,
0x0200020200020200
,
0x0200020200020202
,
0x0200020201000101
,
0x0200020201010001
,
0x0200020201010100
,
0x0200020201010102
,
0x0200020202000000
,
0x0200020202000002
,
0x0200020202000200
,
0x0200020202000202
,
0x0200020202010101
,
0x0200020202020000
,
0x0200020202020002
,
0x0200020202020200
,
0x0200020202020202
,
0x0201000000000101
,
0x0201000000010001
,
0x0201000000010102
,
0x0201000000010200
,
0x0201000000010201
,
0x0201000000020101
,
0x0201000001000001
,
0x0201000001000102
,
0x0201000001000201
,
0x0201000001010101
,
0x0201000001010200
,
0x0201000001010202
,
0x0201000001020201
,
0x0201000001020202
,
0x0201000002000101
,
0x0201000002010001
,
0x0201000002010100
,
0x0201000002010102
,
0x0201000002010201
,
0x0201000002020101
,
0x0201000100000001
,
0x0201000100000100
,
0x0201000100000102
,
0x0201000100000201
,
0x0201000100010000
,
0x0201000100010101
,
0x0201000100010200
,
0x0201000100010202
,
0x0201000100020001
,
0x0201000100020100
,
0x0201000100020102
,
0x0201000100020201
,
0x0201000101000000
,
0x0201000101000101
,
0x0201000101010000
,
0x0201000101010001
,
0x0201000101010100
,
0x0201000101010101
,
0x0201000101010102
,
0x0201000101010201
,
0x0201000101020002
,
0x0201000101020101
,
0x0201000102000100
,
0x0201000102000102
,
0x0201000102010002
,
0x0201000102010101
,
0x0201000102010200
,
0x0201000102020001
,
0x0201000102020100
,
0x0201000102020102
,
0x0201000102020201
,
0x0201000200000101
,
0x0201000200010001
,
0x0201000200010100
,
0x0201000200010201
,
0x0201000200020101
,
0x0201000201000100
,
0x0201000201000102
,
0x0201000201000201
,
0x0201000201010000
,
0x0201000201010002
,
0x0201000201010101
,
0x0201000201010200
,
0x0201000201020102
,
0x0201000201020201
,
0x0201000202000101
,
0x0201000202010100
,
0x0201000202010102
,
0x0201000202020201
,
0x0201010000000001
,
0x0201010000000100
,
0x0201010000000102
,
0x0201010000010000
,
0x0201010000010101
,
0x0201010000010200
,
0x0201010000020102
,
0x0201010001000000
,
0x0201010001000202
,
0x0201010001010001
,
0x0201010001010100
,
0x0201010001010101
,
0x0201010001010102
,
0x0201010001010200
,
0x0201010001010201
,
0x0201010001020000
,
0x0201010001020001
,
0x0201010001020002
,
0x0201010001020101
,
0x0201010002000100
,
0x0201010002000102
,
0x0201010002010002
,
0x0201010002010100
,
0x0201010002010101
,
0x0201010002010200
,
0x0201010002020001
,
0x0201010002020201
,
0x0201010100000000
,
0x0201010100000101
,
0x0201010100000200
,
0x0201010100000202
,
0x0201010100010000
,
0x0201010100010001
,
0x0201010100010100
,
0x0201010100010101
,
0x0201010100010102
,
0x0201010100010201
,
0x0201010100020001
,
0x0201010100020101
,
0x0201010100020201
,
0x0201010100020202
,
0x0201010101000001
,
0x0201010101000100
,
0x0201010101000101
,
0x0201010101000102
,
0x0201010101000201
,
0x0201010101010000
,
0x0201010101010001
,
0x0201010101010002
,
0x0201010101010100
,
0x0201010101010101
,
0x0201010101010102
,
0x0201010101010200
,
0x0201010101010201
,
0x0201010101010202
,
0x0201010101020001
,
0x0201010101020100
,
0x0201010101020101
,
0x0201010101020102
,
0x0201010101020201
,
0x0201010102000001
,
0x0201010102000101
,
0x0201010102000200
,
0x0201010102010001
,
0x0201010102010002
,
0x0201010102010100
,
0x0201010102010101
,
0x0201010102010102
,
0x0201010102010201
,
0x0201010102010202
,
0x0201010102020000
,
0x0201010102020002
,
0x0201010102020101
,
0x0201010102020200
,
0x0201010102020202
,
0x0201010200000001
,
0x0201010200000100
,
0x0201010200010000
,
0x0201010200010101
,
0x0201010200010201
,
0x0201010200020000
,
0x0201010200020102
,
0x0201010200020201
,
0x0201010201000101
,
0x0201010201000200
,
0x0201010201000201
,
0x0201010201010001
,
0x0201010201010002
,
0x0201010201010101
,
0x0201010201010102
,
0x0201010201010201
,
0x0201010201020101
,
0x0201010201020200
,
0x0201010202000002
,
0x0201010202000100
,
0x0201010202000201
,
0x0201010202000202
,
0x0201010202010002
,
0x0201010202010100
,
0x0201010202010101
,
0x0201010202020100
,
0x0201010202020102
,
0x0201010202020201
,
0x0201020000000101
,
0x0201020000010102
,
0x0201020000010201
,
0x0201020000020101
,
0x0201020001000001
,
0x0201020001000102
,
0x0201020001010000
,
0x0201020001010002
,
0x0201020001010101
,
0x0201020001010102
,
0x0201020001010202
,
0x0201020001020100
,
0x0201020001020101
,
0x0201020002000101
,
0x0201020002010001
,
0x0201020002010102
,
0x0201020002010201
,
0x0201020002020101
,
0x0201020100000100
,
0x0201020100000102
,
0x0201020100000201
,
0x0201020100010000
,
0x0201020100010002
,
0x0201020100010101
,
0x0201020100010200
,
0x0201020100010202
,
0x0201020100020000
,
0x0201020100020001
,
0x0201020100020100
,
0x0201020100020102
,
0x0201020101000000
,
0x0201020101000002
,
0x0201020101000101
,
0x0201020101000200
,
0x0201020101000202
,
0x0201020101010001
,
0x0201020101010100
,
0x0201020101010101
,
0x0201020101010102
,
0x0201020101010201
,
0x0201020101020002
,
0x0201020101020101
,
0x0201020101020102
,
0x0201020101020202
,
0x0201020102000001
,
0x0201020102000100
,
0x0201020102010000
,
0x0201020102010002
,
0x0201020102010101
,
0x0201020102010202
,
0x0201020102020001
,
0x0201020102020102
,
0x0201020200000101
,
0x0201020200010101
,
0x0201020200020101
,
0x0201020201000100
,
0x0201020201000102
,
0x0201020201000201
,
0x0201020201010000
,
0x0201020201010101
,
0x0201020201010200
,
0x0201020201020001
,
0x0201020202000101
,
0x0201020202010001
,
0x0201020202010100
,
0x0201020202010101
,
0x0201020202010102
,
0x0202000000000000
,
0x0202000000000002
,
0x0202000000000200
,
0x0202000000000202
,
0x0202000000010101
,
0x0202000000020000
,
0x0202000000020002
,
0x0202000000020200
,
0x0202000000020202
,
0x0202000001000101
,
0x0202000001010001
,
0x0202000001010100
,
0x0202000001010102
,
0x0202000001010201
,
0x0202000002000000
,
0x0202000002000002
,
0x0202000002000200
,
0x0202000002000202
,
0x0202000002010101
,
0x0202000002020000
,
0x0202000002020002
,
0x0202000002020200
,
0x0202000002020202
,
0x0202000100000101
,
0x0202000100000201
,
0x0202000100010001
,
0x0202000100010100
,
0x0202000100010102
,
0x0202000100010201
,
0x0202000100010202
,
0x0202000101000102
,
0x0202000101000201
,
0x0202000101010001
,
0x0202000101010101
,
0x0202000101010200
,
0x0202000101010202
,
0x0202000101020001
,
0x0202000101020100
,
0x0202000102000101
,
0x0202000102010000
,
0x0202000102010002
,
0x0202000102010102
,
0x0202000102010201
,
0x0202000200000002
,
0x0202000200000200
,
0x0202000200000202
,
0x0202000200010000
,
0x0202000200010201
,
0x0202000200020002
,
0x0202000200020200
,
0x0202000200020202
,
0x0202000201000101
,
0x0202000201010001
,
0x0202000201010102
,
0x0202000201010201
,
0x0202000201020101
,
0x0202000202000000
,
0x0202000202000002
,
0x0202000202000200
,
0x0202000202000202
,
0x0202000202010101
,
0x0202000202020000
,
0x0202000202020002
,
0x0202000202020200
,
0x0202000202020202
,
0x0202010000010201
,
0x0202010000020101
,
0x0202010001000001
,
0x0202010001000100
,
0x0202010001010000
,
0x0202010001010100
,
0x0202010001010101
,
0x0202010001010200
,
0x0202010001010202
,
0x0202010001020001
,
0x0202010001020101
,
0x0202010001020102
,
0x0202010001020200
,
0x0202010001020201
,
0x0202010002000101
,
0x0202010100000102
,
0x0202010100000201
,
0x0202010100010000
,
0x0202010100010002
,
0x0202010100010101
,
0x0202010100010200
,
0x0202010100020102
,
0x0202010100020201
,
0x0202010101000002
,
0x0202010101000101
,
0x0202010101010001
,
0x0202010101010100
,
0x0202010101010101
,
0x0202010101010102
,
0x0202010101010201
,
0x0202010101020101
,
0x0202010101020202
,
0x0202010102000001
,
0x0202010102000100
,
0x0202010102000101
,
0x0202010102000102
,
0x0202010102000201
,
0x0202010102010002
,
0x0202010102010101
,
0x0202010102010200
,
0x0202010200000101
,
0x0202010200010001
,
0x0202010200010102
,
0x0202010200010202
,
0x0202010200020001
,
0x0202010200020101
,
0x0202010201000100
,
0x0202010201000102
,
0x0202010201000202
,
0x0202010201010002
,
0x0202010201010101
,
0x0202010201010102
,
0x0202010201010200
,
0x0202010201020000
,
0x0202010201020002
,
0x0202010202000102
,
0x0202010202010000
,
0x0202010202010101
,
0x0202010202010102
,
0x0202010202010201
,
0x0202010202020001
,
0x0202010202020100
,
0x0202010202020102
,
0x0202020000000000
,
0x0202020000000002
,
0x0202020000000200
,
0x0202020000000202
,
0x0202020000020000
,
0x0202020000020002
,
0x0202020000020200
,
0x0202020000020202
,
0x0202020001010001
,
0x0202020001010100
,
0x0202020001010102
,
0x0202020001010201
,
0x0202020002000000
,
0x0202020002000002
,
0x0202020002000200
,
0x0202020002000202
,
0x0202020002010101
,
0x0202020002020000
,
0x0202020002020002
,
0x0202020002020200
,
0x0202020002020202
,
0x0202020100000101
,
0x0202020100010100
,
0x0202020100010201
,
0x0202020100020001
,
0x0202020100020101
,
0x0202020101000001
,
0x0202020101010000
,
0x0202020101010101
,
0x0202020101010202
,
0x0202020101020001
,
0x0202020101020102
,
0x0202020101020201
,
0x0202020102010000
,
0x0202020102010102
,
0x0202020200000000
,
0x0202020200000002
,
0x0202020200000200
,
0x0202020200000202
,
0x0202020200020000
,
0x0202020200020002
,
0x0202020200020200
,
0x0202020200020202
,
0x0202020201010001
,
0x0202020201010100
,
0x0202020201010102
,
0x0202020202000000
,
0x0202020202000002
,
0x0202020202000200
,
0x0202020202000202
,
0x0202020202010101
,
0x0202020202020000
,
0x0202020202020002
,
0x0202020202020200
,
0x0202020202020202
,
};
#else
static
const
uint32_t
iq1s_grid_us
[
2048
]
=
{
0x00000000
,
0x00000002
,
0x00000101
,
0x00000200
,
0x00000202
,
0x00010001
,
0x00010101
,
0x00020000
,
0x00020002
,
0x00020200
,
0x00020202
,
0x01000101
,
0x01010001
,
0x01010100
,
0x01010102
,
0x01020101
,
0x02000000
,
0x02000002
,
0x02000200
,
0x02000202
,
0x02010101
,
0x02020000
,
0x02020002
,
0x02020200
,
0x02020202
,
0x00000110
,
0x00000111
,
0x00010011
,
0x00010110
,
0x00010112
,
0x00010211
,
0x00010212
,
0x00020111
,
0x01000011
,
0x01000112
,
0x01000211
,
0x01010012
,
0x01010111
,
0x01010212
,
0x01020011
,
0x01020110
,
0x01020112
,
0x01020210
,
0x02000111
,
0x02010011
,
0x02010110
,
0x02010112
,
0x02020111
,
0x00000020
,
0x00000022
,
0x00000220
,
0x00000222
,
0x00010121
,
0x00020020
,
0x00020022
,
0x00020220
,
0x00020222
,
0x01000121
,
0x01010021
,
0x01010221
,
0x01020120
,
0x01020221
,
0x02000020
,
0x02000022
,
0x02000220
,
0x02000222
,
0x02010021
,
0x02010121
,
0x02010221
,
0x02020020
,
0x02020022
,
0x02020220
,
0x02020222
,
0x00011001
,
0x00011100
,
0x00011102
,
0x00021101
,
0x01001001
,
0x01001201
,
0x01011101
,
0x01011202
,
0x01021100
,
0x01021101
,
0x02011001
,
0x02011201
,
0x02021101
,
0x00001011
,
0x00001110
,
0x00001111
,
0x00001112
,
0x00011111
,
0x00011210
,
0x00011212
,
0x00021211
,
0x01001010
,
0x01001111
,
0x01001212
,
0x01011010
,
0x01011011
,
0x01011110
,
0x01011111
,
0x01011112
,
0x01011211
,
0x01021010
,
0x01021012
,
0x01021111
,
0x01021210
,
0x01021212
,
0x02001011
,
0x02011011
,
0x02011111
,
0x02011210
,
0x02011212
,
0x02021011
,
0x02021110
,
0x02021111
,
0x02021112
,
0x02021211
,
0x00011120
,
0x00011221
,
0x01001021
,
0x01001120
,
0x01011020
,
0x01011022
,
0x01011121
,
0x01011220
,
0x01021020
,
0x01021021
,
0x01021122
,
0x01021221
,
0x02001121
,
0x02011021
,
0x02011120
,
0x02011221
,
0x00002000
,
0x00002002
,
0x00002200
,
0x00002202
,
0x00012101
,
0x00022000
,
0x00022002
,
0x00022200
,
0x00022202
,
0x01002101
,
0x01012001
,
0x01012102
,
0x01022101
,
0x02002000
,
0x02002002
,
0x02002200
,
0x02002202
,
0x02012101
,
0x02022000
,
0x02022002
,
0x02022200
,
0x02022202
,
0x00002111
,
0x00012011
,
0x00012110
,
0x00012211
,
0x00022110
,
0x00022111
,
0x01002011
,
0x01012010
,
0x01012011
,
0x01012111
,
0x01022011
,
0x01022110
,
0x01022211
,
0x02012011
,
0x02012110
,
0x02012112
,
0x02012211
,
0x02022111
,
0x00002020
,
0x00002022
,
0x00002220
,
0x00002222
,
0x00012121
,
0x00022020
,
0x00022022
,
0x00022220
,
0x00022222
,
0x01002121
,
0x01012021
,
0x01012221
,
0x01022021
,
0x01022121
,
0x02002020
,
0x02002022
,
0x02002121
,
0x02002220
,
0x02002222
,
0x02012121
,
0x02022020
,
0x02022022
,
0x02022220
,
0x02022222
,
0x00110000
,
0x00110001
,
0x00110100
,
0x00110201
,
0x00120100
,
0x00120101
,
0x01100001
,
0x01100100
,
0x01110000
,
0x01110101
,
0x01110200
,
0x01120001
,
0x01120100
,
0x01120101
,
0x01120201
,
0x02110001
,
0x02110100
,
0x02110102
,
0x02120001
,
0x02120101
,
0x00100011
,
0x00100110
,
0x00100112
,
0x00100211
,
0x00110010
,
0x00110012
,
0x00110111
,
0x00110210
,
0x00120011
,
0x00120110
,
0x00120211
,
0x01100111
,
0x01100212
,
0x01110010
,
0x01110011
,
0x01110012
,
0x01110110
,
0x01110111
,
0x01110112
,
0x01110211
,
0x01120010
,
0x01120111
,
0x02100110
,
0x02110012
,
0x02110111
,
0x02120011
,
0x02120110
,
0x00110021
,
0x00110120
,
0x00110122
,
0x00120121
,
0x01100020
,
0x01100122
,
0x01100221
,
0x01110022
,
0x01110121
,
0x01110220
,
0x01110222
,
0x01120120
,
0x01120122
,
0x02100121
,
0x02110021
,
0x02110120
,
0x02110122
,
0x02120121
,
0x00101001
,
0x00101102
,
0x00101201
,
0x00111100
,
0x00111101
,
0x00111200
,
0x00111201
,
0x00121001
,
0x00121102
,
0x01101001
,
0x01101101
,
0x01101102
,
0x01101200
,
0x01101202
,
0x01111001
,
0x01111100
,
0x01111101
,
0x01111102
,
0x01111201
,
0x01121002
,
0x01121101
,
0x01121200
,
0x02101100
,
0x02101201
,
0x02111000
,
0x02111100
,
0x02111101
,
0x02111200
,
0x02111201
,
0x02111202
,
0x02121001
,
0x02121100
,
0x02121101
,
0x02121201
,
0x00101012
,
0x00101111
,
0x00101212
,
0x00111011
,
0x00111110
,
0x00111111
,
0x00111112
,
0x00111211
,
0x00121010
,
0x00121012
,
0x00121111
,
0x00121210
,
0x00121212
,
0x01101011
,
0x01101110
,
0x01101111
,
0x01101112
,
0x01111011
,
0x01111012
,
0x01111110
,
0x01111111
,
0x01111112
,
0x01111211
,
0x01111212
,
0x01121011
,
0x01121110
,
0x01121111
,
0x01121112
,
0x01121211
,
0x02101010
,
0x02101012
,
0x02101110
,
0x02101111
,
0x02101210
,
0x02101212
,
0x02111010
,
0x02111011
,
0x02111110
,
0x02111111
,
0x02111112
,
0x02111211
,
0x02111212
,
0x02121010
,
0x02121012
,
0x02121111
,
0x00101021
,
0x00101120
,
0x00101121
,
0x00101122
,
0x00111121
,
0x00111122
,
0x00111220
,
0x00111222
,
0x00121021
,
0x00121122
,
0x01101020
,
0x01101022
,
0x01101120
,
0x01101121
,
0x01101220
,
0x01101222
,
0x01111021
,
0x01111121
,
0x01111122
,
0x01111220
,
0x01111221
,
0x01121021
,
0x01121120
,
0x01121121
,
0x01121220
,
0x01121221
,
0x01121222
,
0x02101122
,
0x02101222
,
0x02111022
,
0x02111121
,
0x02121120
,
0x02121221
,
0x00112001
,
0x00112102
,
0x00122101
,
0x01102001
,
0x01102100
,
0x01102102
,
0x01102201
,
0x01112000
,
0x01112101
,
0x01112200
,
0x01112202
,
0x01122000
,
0x01122001
,
0x01122100
,
0x01122102
,
0x01122201
,
0x02102101
,
0x02112001
,
0x02112100
,
0x02122101
,
0x00112010
,
0x00112012
,
0x00112111
,
0x00112212
,
0x00122011
,
0x00122111
,
0x01102012
,
0x01102110
,
0x01102111
,
0x01102210
,
0x01112011
,
0x01112110
,
0x01112111
,
0x01112112
,
0x01112211
,
0x01112212
,
0x01122010
,
0x01122111
,
0x01122212
,
0x02102211
,
0x02112011
,
0x02112012
,
0x02112111
,
0x02112210
,
0x02122011
,
0x02122112
,
0x02122211
,
0x00102221
,
0x00112122
,
0x00122120
,
0x00122122
,
0x01102120
,
0x01102122
,
0x01102221
,
0x01112020
,
0x01112022
,
0x01112121
,
0x01112220
,
0x01122021
,
0x01122122
,
0x01122221
,
0x02102121
,
0x02112021
,
0x02112122
,
0x02112222
,
0x00200000
,
0x00200002
,
0x00200200
,
0x00200202
,
0x00210101
,
0x00220000
,
0x00220002
,
0x00220101
,
0x00220200
,
0x00220202
,
0x01200101
,
0x01210001
,
0x01210201
,
0x01220001
,
0x01220101
,
0x02200000
,
0x02200002
,
0x02200200
,
0x02200202
,
0x02210101
,
0x02220000
,
0x02220002
,
0x02220101
,
0x02220200
,
0x02220202
,
0x00200111
,
0x00210011
,
0x00210110
,
0x00210211
,
0x00220111
,
0x01200012
,
0x01200110
,
0x01200211
,
0x01210111
,
0x01210210
,
0x01210212
,
0x01220011
,
0x01220110
,
0x01220111
,
0x01220112
,
0x02200111
,
0x02210010
,
0x02210112
,
0x02210211
,
0x02220111
,
0x00200021
,
0x00200220
,
0x00200222
,
0x00210021
,
0x00210121
,
0x00220020
,
0x00220022
,
0x00220220
,
0x00220222
,
0x01200121
,
0x01210021
,
0x01210122
,
0x01210221
,
0x01220121
,
0x02200021
,
0x02200220
,
0x02200222
,
0x02210021
,
0x02210121
,
0x02220020
,
0x02220022
,
0x02220220
,
0x02220222
,
0x00201101
,
0x00211100
,
0x00211102
,
0x00211201
,
0x00221101
,
0x01201100
,
0x01201101
,
0x01201102
,
0x01201201
,
0x01211002
,
0x01211101
,
0x01211200
,
0x01211202
,
0x01221102
,
0x02201101
,
0x02211001
,
0x02211100
,
0x02211201
,
0x02221001
,
0x02221101
,
0x00201211
,
0x00211111
,
0x00221011
,
0x00221211
,
0x01201010
,
0x01201111
,
0x01201210
,
0x01211011
,
0x01211110
,
0x01211111
,
0x01211211
,
0x01221012
,
0x01221111
,
0x01221210
,
0x02201211
,
0x02211010
,
0x02211110
,
0x02211111
,
0x02211210
,
0x02211212
,
0x02221011
,
0x02221110
,
0x02221112
,
0x02221211
,
0x00201121
,
0x00211020
,
0x00211022
,
0x00211221
,
0x00221121
,
0x01201021
,
0x01201221
,
0x01211121
,
0x01221020
,
0x01221021
,
0x01221221
,
0x02201120
,
0x02201122
,
0x02211020
,
0x02211222
,
0x00202000
,
0x00202002
,
0x00202200
,
0x00202202
,
0x00212101
,
0x00222000
,
0x00222002
,
0x00222200
,
0x00222202
,
0x01202101
,
0x01212001
,
0x01212100
,
0x01222101
,
0x02202000
,
0x02202002
,
0x02202200
,
0x02202202
,
0x02222000
,
0x02222002
,
0x02222200
,
0x02222202
,
0x00202211
,
0x00212011
,
0x00212110
,
0x00212211
,
0x00222111
,
0x01202112
,
0x01202211
,
0x01212012
,
0x01212111
,
0x01222011
,
0x01222110
,
0x01222112
,
0x01222211
,
0x02202111
,
0x02212010
,
0x02212112
,
0x02212211
,
0x02222110
,
0x02222111
,
0x00202020
,
0x00202022
,
0x00202220
,
0x00202222
,
0x00222020
,
0x00222022
,
0x00222220
,
0x00222222
,
0x01202121
,
0x01212021
,
0x01212122
,
0x01212221
,
0x01222121
,
0x02202020
,
0x02202022
,
0x02202220
,
0x02202222
,
0x02212121
,
0x02222020
,
0x02222022
,
0x02222220
,
0x02222222
,
0x10000101
,
0x10010001
,
0x10010102
,
0x10020101
,
0x11000201
,
0x11010002
,
0x11010101
,
0x11010200
,
0x11010202
,
0x11020001
,
0x11020100
,
0x11020102
,
0x12010100
,
0x12010201
,
0x12020001
,
0x12020102
,
0x10000010
,
0x10000011
,
0x10000110
,
0x10000112
,
0x10000211
,
0x10010012
,
0x10010111
,
0x10010112
,
0x10010210
,
0x10010212
,
0x10020011
,
0x10020112
,
0x10020211
,
0x11000111
,
0x11000210
,
0x11000212
,
0x11010011
,
0x11010110
,
0x11010111
,
0x11010112
,
0x11010211
,
0x11010212
,
0x11020111
,
0x11020210
,
0x11020212
,
0x12000011
,
0x12000110
,
0x12000112
,
0x12010010
,
0x12010012
,
0x12010111
,
0x12020010
,
0x12020011
,
0x12020012
,
0x10000121
,
0x10010021
,
0x10010120
,
0x10010122
,
0x10020121
,
0x11000021
,
0x11010022
,
0x11010121
,
0x11010222
,
0x11020120
,
0x11020221
,
0x12000221
,
0x12010120
,
0x12020121
,
0x10001001
,
0x10011101
,
0x10011201
,
0x10021201
,
0x11001101
,
0x11001200
,
0x11001202
,
0x11011001
,
0x11011100
,
0x11011101
,
0x11011102
,
0x11021001
,
0x11021002
,
0x11021101
,
0x11021200
,
0x11021202
,
0x12001001
,
0x12001102
,
0x12001201
,
0x12011000
,
0x12011002
,
0x12011101
,
0x12021000
,
0x12021001
,
0x12021201
,
0x10001011
,
0x10001012
,
0x10001111
,
0x10001212
,
0x10011011
,
0x10011110
,
0x10011111
,
0x10011112
,
0x10011211
,
0x10021010
,
0x10021111
,
0x10021212
,
0x11001011
,
0x11001110
,
0x11001111
,
0x11001112
,
0x11001211
,
0x11011010
,
0x11011011
,
0x11011110
,
0x11011111
,
0x11011112
,
0x11011210
,
0x11011211
,
0x11021011
,
0x11021110
,
0x11021111
,
0x11021112
,
0x11021211
,
0x12001012
,
0x12001110
,
0x12001111
,
0x12001210
,
0x12011011
,
0x12011110
,
0x12011111
,
0x12011112
,
0x12011211
,
0x12011212
,
0x12021111
,
0x12021210
,
0x12021212
,
0x10001021
,
0x10001121
,
0x10001221
,
0x10011120
,
0x10011121
,
0x10011220
,
0x10011222
,
0x10021021
,
0x10021120
,
0x10021221
,
0x11001020
,
0x11001022
,
0x11001121
,
0x11001220
,
0x11011020
,
0x11011021
,
0x11011022
,
0x11011121
,
0x11011122
,
0x11011221
,
0x11021022
,
0x11021121
,
0x11021220
,
0x12001021
,
0x12001121
,
0x12001222
,
0x12011120
,
0x12011121
,
0x12021021
,
0x12021120
,
0x12021122
,
0x10002101
,
0x10012001
,
0x10012101
,
0x10012202
,
0x10022101
,
0x11002002
,
0x11002201
,
0x11012000
,
0x11012101
,
0x11012200
,
0x11022001
,
0x11022100
,
0x11022102
,
0x11022201
,
0x12002101
,
0x12012001
,
0x12012100
,
0x12012102
,
0x12012201
,
0x12022101
,
0x10002011
,
0x10002111
,
0x10002112
,
0x10002212
,
0x10012010
,
0x10012110
,
0x10012111
,
0x10012210
,
0x10022011
,
0x10022110
,
0x10022112
,
0x11002010
,
0x11002111
,
0x11002212
,
0x11012011
,
0x11012012
,
0x11012110
,
0x11012111
,
0x11012112
,
0x11012211
,
0x11022010
,
0x11022012
,
0x11022111
,
0x11022112
,
0x11022212
,
0x12002112
,
0x12002211
,
0x12012012
,
0x12012111
,
0x12012112
,
0x12012210
,
0x12022011
,
0x12022110
,
0x12022112
,
0x12022211
,
0x10012122
,
0x11002120
,
0x11002122
,
0x11002221
,
0x11012121
,
0x11012220
,
0x11012222
,
0x11022120
,
0x11022221
,
0x12012120
,
0x12022121
,
0x10100001
,
0x10100100
,
0x10100101
,
0x10100102
,
0x10100201
,
0x10110002
,
0x10110101
,
0x10110202
,
0x10120001
,
0x10120100
,
0x10120201
,
0x11100000
,
0x11100101
,
0x11100200
,
0x11110001
,
0x11110100
,
0x11110101
,
0x11110102
,
0x11110201
,
0x11120101
,
0x11120200
,
0x12100102
,
0x12100201
,
0x12110101
,
0x12110200
,
0x12120000
,
0x12120001
,
0x12120102
,
0x12120201
,
0x10100111
,
0x10100210
,
0x10100211
,
0x10100212
,
0x10110011
,
0x10110110
,
0x10110111
,
0x10110112
,
0x10110210
,
0x10110211
,
0x10120010
,
0x10120111
,
0x10120112
,
0x10120210
,
0x10120212
,
0x11100011
,
0x11100110
,
0x11100111
,
0x11100112
,
0x11100211
,
0x11110010
,
0x11110011
,
0x11110012
,
0x11110110
,
0x11110111
,
0x11110112
,
0x11110210
,
0x11110211
,
0x11110212
,
0x11120011
,
0x11120110
,
0x11120111
,
0x11120112
,
0x11120211
,
0x12100012
,
0x12100111
,
0x12110011
,
0x12110110
,
0x12110111
,
0x12110112
,
0x12110211
,
0x12120010
,
0x12120111
,
0x12120212
,
0x10100021
,
0x10100122
,
0x10110022
,
0x10110121
,
0x10110222
,
0x10120021
,
0x10120120
,
0x11100022
,
0x11100121
,
0x11100222
,
0x11110021
,
0x11110120
,
0x11110121
,
0x11110122
,
0x11110221
,
0x11120022
,
0x11120121
,
0x12100121
,
0x12110020
,
0x12110022
,
0x12110121
,
0x12110221
,
0x12110222
,
0x12120120
,
0x10101100
,
0x10101101
,
0x10111001
,
0x10111100
,
0x10111101
,
0x10111102
,
0x10111200
,
0x10111201
,
0x10121001
,
0x10121101
,
0x10121200
,
0x10121202
,
0x11101001
,
0x11101100
,
0x11101101
,
0x11101102
,
0x11101201
,
0x11101202
,
0x11111000
,
0x11111001
,
0x11111100
,
0x11111101
,
0x11111102
,
0x11111200
,
0x11111201
,
0x11111202
,
0x11121001
,
0x11121002
,
0x11121100
,
0x11121101
,
0x11121102
,
0x11121201
,
0x12101000
,
0x12101200
,
0x12101202
,
0x12111001
,
0x12111100
,
0x12111101
,
0x12111102
,
0x12111201
,
0x12121001
,
0x12121100
,
0x12121101
,
0x12121202
,
0x10101011
,
0x10101012
,
0x10101110
,
0x10101111
,
0x10101112
,
0x10101211
,
0x10111010
,
0x10111011
,
0x10111012
,
0x10111110
,
0x10111111
,
0x10111112
,
0x10111211
,
0x10111212
,
0x10121011
,
0x10121110
,
0x10121111
,
0x10121112
,
0x10121211
,
0x11101010
,
0x11101011
,
0x11101012
,
0x11101110
,
0x11101111
,
0x11101112
,
0x11101210
,
0x11101211
,
0x11111010
,
0x11111011
,
0x11111012
,
0x11111110
,
0x11111111
,
0x11111112
,
0x11111210
,
0x11111211
,
0x11111212
,
0x11121010
,
0x11121011
,
0x11121110
,
0x11121111
,
0x11121112
,
0x11121210
,
0x11121211
,
0x11121212
,
0x12101011
,
0x12101110
,
0x12101111
,
0x12101211
,
0x12101212
,
0x12111010
,
0x12111011
,
0x12111110
,
0x12111111
,
0x12111112
,
0x12111210
,
0x12111211
,
0x12121011
,
0x12121110
,
0x12121111
,
0x12121112
,
0x12121211
,
0x10101020
,
0x10101021
,
0x10101022
,
0x10101120
,
0x10101122
,
0x10101220
,
0x10101221
,
0x10111021
,
0x10111120
,
0x10111121
,
0x10111220
,
0x10111221
,
0x10121020
,
0x10121021
,
0x10121022
,
0x10121120
,
0x10121121
,
0x10121122
,
0x10121220
,
0x10121221
,
0x11101021
,
0x11101121
,
0x11101122
,
0x11101220
,
0x11101221
,
0x11101222
,
0x11111020
,
0x11111021
,
0x11111022
,
0x11111120
,
0x11111121
,
0x11111122
,
0x11111220
,
0x11111221
,
0x11111222
,
0x11121021
,
0x11121120
,
0x11121121
,
0x11121221
,
0x12101022
,
0x12101121
,
0x12101122
,
0x12101220
,
0x12101221
,
0x12101222
,
0x12111021
,
0x12111121
,
0x12111222
,
0x12121022
,
0x12121121
,
0x12121122
,
0x12121220
,
0x12121221
,
0x10102100
,
0x10102101
,
0x10102102
,
0x10102201
,
0x10112000
,
0x10112101
,
0x10112200
,
0x10122001
,
0x10122202
,
0x11102101
,
0x11102200
,
0x11102202
,
0x11112001
,
0x11112100
,
0x11112101
,
0x11112102
,
0x11112200
,
0x11112201
,
0x11122000
,
0x11122002
,
0x11122100
,
0x11122101
,
0x12102002
,
0x12102201
,
0x12112000
,
0x12112002
,
0x12112101
,
0x12112200
,
0x12122001
,
0x12122201
,
0x10102011
,
0x10102012
,
0x10102111
,
0x10102212
,
0x10112011
,
0x10112110
,
0x10112111
,
0x10112112
,
0x10112211
,
0x10122111
,
0x11102011
,
0x11102110
,
0x11102111
,
0x11102112
,
0x11102211
,
0x11112010
,
0x11112011
,
0x11112012
,
0x11112110
,
0x11112111
,
0x11112112
,
0x11112210
,
0x11112211
,
0x11112212
,
0x11122011
,
0x11122110
,
0x11122111
,
0x11122112
,
0x11122211
,
0x12102011
,
0x12102111
,
0x12102211
,
0x12112011
,
0x12112110
,
0x12112111
,
0x12112112
,
0x12112210
,
0x12112211
,
0x12122111
,
0x10102120
,
0x10102220
,
0x10112121
,
0x10112222
,
0x10122020
,
0x10122121
,
0x10122122
,
0x10122221
,
0x11102121
,
0x11102220
,
0x11102221
,
0x11112021
,
0x11112121
,
0x11112122
,
0x11112220
,
0x11112221
,
0x11122022
,
0x11122121
,
0x11122220
,
0x11122222
,
0x12102021
,
0x12102222
,
0x12112022
,
0x12112121
,
0x12112122
,
0x12112220
,
0x12112222
,
0x12122021
,
0x10200101
,
0x10210100
,
0x10210102
,
0x10210201
,
0x10220101
,
0x11200100
,
0x11210000
,
0x11210101
,
0x11210102
,
0x11210200
,
0x11210202
,
0x11220001
,
0x11220100
,
0x11220102
,
0x11220201
,
0x12200001
,
0x12210102
,
0x12220101
,
0x10200011
,
0x10200110
,
0x10200112
,
0x10200211
,
0x10210012
,
0x10210111
,
0x10220011
,
0x10220012
,
0x10220112
,
0x10220211
,
0x11200111
,
0x11200211
,
0x11210011
,
0x11210111
,
0x11210112
,
0x11210211
,
0x11220111
,
0x11220112
,
0x11220212
,
0x12200110
,
0x12200212
,
0x12210012
,
0x12210111
,
0x12220011
,
0x12220112
,
0x12220211
,
0x10210021
,
0x10210122
,
0x10210221
,
0x11200020
,
0x11200021
,
0x11200122
,
0x11210121
,
0x11210122
,
0x11210220
,
0x11220020
,
0x12200121
,
0x12210021
,
0x12210122
,
0x12220121
,
0x10211001
,
0x10211002
,
0x10211101
,
0x10211102
,
0x10211202
,
0x10221001
,
0x10221102
,
0x10221201
,
0x11201000
,
0x11201002
,
0x11201101
,
0x11201200
,
0x11201202
,
0x11211001
,
0x11211100
,
0x11211101
,
0x11211102
,
0x11211201
,
0x11211202
,
0x11221000
,
0x11221002
,
0x11221101
,
0x12201100
,
0x12201101
,
0x12201201
,
0x12211000
,
0x12211002
,
0x12211100
,
0x12211101
,
0x12211102
,
0x12211200
,
0x12211202
,
0x12221001
,
0x12221100
,
0x12221201
,
0x10201111
,
0x10201210
,
0x10201212
,
0x10211011
,
0x10211111
,
0x10211112
,
0x10211211
,
0x11201110
,
0x11201111
,
0x11201112
,
0x11201211
,
0x11211010
,
0x11211011
,
0x11211110
,
0x11211111
,
0x11211112
,
0x11211211
,
0x11221011
,
0x11221110
,
0x11221111
,
0x11221112
,
0x11221211
,
0x12201112
,
0x12201211
,
0x12201212
,
0x12211011
,
0x12211111
,
0x12211112
,
0x12211211
,
0x12211212
,
0x12221012
,
0x12221111
,
0x12221112
,
0x12221210
,
0x10201022
,
0x10201221
,
0x10211121
,
0x10221020
,
0x10221122
,
0x10221220
,
0x10221221
,
0x11201020
,
0x11201121
,
0x11201220
,
0x11201222
,
0x11211021
,
0x11211120
,
0x11211121
,
0x11211122
,
0x11211220
,
0x11211222
,
0x11221020
,
0x11221121
,
0x11221220
,
0x12201020
,
0x12201022
,
0x12201121
,
0x12201222
,
0x12211120
,
0x12211122
,
0x12211220
,
0x12211221
,
0x12221020
,
0x12221120
,
0x12221122
,
0x12221222
,
0x10212102
,
0x10212201
,
0x10222101
,
0x11202001
,
0x11212002
,
0x11212101
,
0x11212202
,
0x11222001
,
0x11222201
,
0x12202101
,
0x12212001
,
0x12212200
,
0x12222102
,
0x10202011
,
0x10202110
,
0x10212010
,
0x10212111
,
0x10222011
,
0x10222110
,
0x10222112
,
0x10222211
,
0x11202010
,
0x11202011
,
0x11202111
,
0x11202112
,
0x11202210
,
0x11212011
,
0x11212110
,
0x11212111
,
0x11212112
,
0x11212211
,
0x11222010
,
0x11222111
,
0x11222212
,
0x12202012
,
0x12202110
,
0x12202212
,
0x12212111
,
0x12222011
,
0x12222110
,
0x12222111
,
0x12222211
,
0x10212021
,
0x10212122
,
0x10212220
,
0x11202021
,
0x11202120
,
0x11202221
,
0x11212020
,
0x11212121
,
0x11212220
,
0x11212222
,
0x11222120
,
0x11222121
,
0x11222221
,
0x12202122
,
0x12212120
,
0x12212220
,
0x12212222
,
0x12222122
,
0x20000000
,
0x20000002
,
0x20000200
,
0x20000202
,
0x20020000
,
0x20020002
,
0x20020200
,
0x20020202
,
0x21000101
,
0x21010000
,
0x21010001
,
0x21010100
,
0x21010102
,
0x21010201
,
0x21020101
,
0x22000000
,
0x22000002
,
0x22000200
,
0x22000202
,
0x22010101
,
0x22020000
,
0x22020002
,
0x22020200
,
0x22020202
,
0x20000111
,
0x20010011
,
0x20010110
,
0x20010112
,
0x20010211
,
0x20020111
,
0x21000011
,
0x21000110
,
0x21000211
,
0x21010010
,
0x21010012
,
0x21010111
,
0x21010112
,
0x21010210
,
0x21010211
,
0x21020110
,
0x21020112
,
0x21020211
,
0x22000111
,
0x22000211
,
0x22010110
,
0x22010112
,
0x22010211
,
0x22020111
,
0x20000020
,
0x20000022
,
0x20000220
,
0x20000222
,
0x20010121
,
0x20020020
,
0x20020022
,
0x20020220
,
0x20020222
,
0x21010021
,
0x21010120
,
0x21010221
,
0x21020121
,
0x22000020
,
0x22000022
,
0x22000220
,
0x22000222
,
0x22010121
,
0x22020020
,
0x22020022
,
0x22020220
,
0x22020222
,
0x20011100
,
0x20011201
,
0x21001001
,
0x21001100
,
0x21011001
,
0x21011101
,
0x21011202
,
0x21021001
,
0x21021100
,
0x21021201
,
0x22011100
,
0x22011201
,
0x20001011
,
0x20001211
,
0x20011012
,
0x20011111
,
0x20011212
,
0x20021112
,
0x20021211
,
0x21001010
,
0x21001011
,
0x21001111
,
0x21001210
,
0x21011011
,
0x21011110
,
0x21011111
,
0x21011112
,
0x21011211
,
0x21011212
,
0x21021111
,
0x21021112
,
0x21021210
,
0x21021212
,
0x22001011
,
0x22001110
,
0x22001112
,
0x22001211
,
0x22011010
,
0x22011012
,
0x22011111
,
0x22011210
,
0x22021112
,
0x20011021
,
0x20011122
,
0x20011221
,
0x20021121
,
0x21001021
,
0x21001120
,
0x21001221
,
0x21001222
,
0x21011020
,
0x21011121
,
0x21011221
,
0x21011222
,
0x21021021
,
0x21021122
,
0x21021222
,
0x22001121
,
0x22011021
,
0x22011222
,
0x22021120
,
0x20002000
,
0x20002002
,
0x20002200
,
0x20002202
,
0x20012101
,
0x20022000
,
0x20022002
,
0x20022200
,
0x20022202
,
0x21002001
,
0x21002101
,
0x21012001
,
0x21012100
,
0x21012201
,
0x21022101
,
0x21022201
,
0x22002000
,
0x22002002
,
0x22002200
,
0x22002202
,
0x22012101
,
0x22022000
,
0x22022002
,
0x22022200
,
0x22022202
,
0x20002111
,
0x20002112
,
0x20012011
,
0x20012110
,
0x20012112
,
0x20022111
,
0x21002011
,
0x21002110
,
0x21002112
,
0x21002211
,
0x21012010
,
0x21012012
,
0x21012111
,
0x21012212
,
0x21022011
,
0x21022110
,
0x22002111
,
0x22012112
,
0x22012211
,
0x22022111
,
0x20002020
,
0x20002022
,
0x20002220
,
0x20002222
,
0x20012121
,
0x20022020
,
0x20022022
,
0x20022220
,
0x20022222
,
0x21002121
,
0x21012021
,
0x21012120
,
0x21012122
,
0x22002020
,
0x22002022
,
0x22002220
,
0x22002222
,
0x22012121
,
0x22022020
,
0x22022022
,
0x22022220
,
0x22022222
,
0x20100101
,
0x20110001
,
0x20110102
,
0x20110200
,
0x20110201
,
0x20120101
,
0x21100001
,
0x21100102
,
0x21100201
,
0x21110101
,
0x21110200
,
0x21110202
,
0x21120201
,
0x21120202
,
0x22100101
,
0x22110001
,
0x22110100
,
0x22110102
,
0x22110201
,
0x22120101
,
0x20100011
,
0x20100110
,
0x20100112
,
0x20100211
,
0x20110010
,
0x20110111
,
0x20110210
,
0x20110212
,
0x20120011
,
0x20120110
,
0x20120112
,
0x20120211
,
0x21100010
,
0x21100111
,
0x21110010
,
0x21110011
,
0x21110110
,
0x21110111
,
0x21110112
,
0x21110211
,
0x21120012
,
0x21120111
,
0x22100110
,
0x22100112
,
0x22110012
,
0x22110111
,
0x22110210
,
0x22120011
,
0x22120110
,
0x22120112
,
0x22120211
,
0x20100121
,
0x20110021
,
0x20110120
,
0x20110221
,
0x20120121
,
0x21100120
,
0x21100122
,
0x21100221
,
0x21110020
,
0x21110022
,
0x21110121
,
0x21110220
,
0x21120122
,
0x21120221
,
0x22100121
,
0x22110120
,
0x22110122
,
0x22120221
,
0x20101001
,
0x20101100
,
0x20101102
,
0x20111000
,
0x20111101
,
0x20111200
,
0x20121102
,
0x21101000
,
0x21101202
,
0x21111001
,
0x21111100
,
0x21111101
,
0x21111102
,
0x21111200
,
0x21111201
,
0x21121000
,
0x21121001
,
0x21121002
,
0x21121101
,
0x22101100
,
0x22101102
,
0x22111002
,
0x22111100
,
0x22111101
,
0x22111200
,
0x22121001
,
0x22121201
,
0x20101010
,
0x20101111
,
0x20101210
,
0x20101212
,
0x20111010
,
0x20111011
,
0x20111110
,
0x20111111
,
0x20111112
,
0x20111211
,
0x20121011
,
0x20121111
,
0x20121211
,
0x20121212
,
0x21101011
,
0x21101110
,
0x21101111
,
0x21101112
,
0x21101211
,
0x21111010
,
0x21111011
,
0x21111012
,
0x21111110
,
0x21111111
,
0x21111112
,
0x21111210
,
0x21111211
,
0x21111212
,
0x21121011
,
0x21121110
,
0x21121111
,
0x21121112
,
0x21121211
,
0x22101011
,
0x22101111
,
0x22101210
,
0x22111011
,
0x22111012
,
0x22111110
,
0x22111111
,
0x22111112
,
0x22111211
,
0x22111212
,
0x22121010
,
0x22121012
,
0x22121111
,
0x22121210
,
0x22121212
,
0x20101021
,
0x20101120
,
0x20111020
,
0x20111121
,
0x20111221
,
0x20121020
,
0x20121122
,
0x20121221
,
0x21101121
,
0x21101220
,
0x21101221
,
0x21111021
,
0x21111022
,
0x21111121
,
0x21111122
,
0x21111221
,
0x21121121
,
0x21121220
,
0x22101022
,
0x22101120
,
0x22101221
,
0x22101222
,
0x22111022
,
0x22111120
,
0x22111121
,
0x22121120
,
0x22121122
,
0x22121221
,
0x20102101
,
0x20112102
,
0x20112201
,
0x20122101
,
0x21102001
,
0x21102102
,
0x21112000
,
0x21112002
,
0x21112101
,
0x21112102
,
0x21112202
,
0x21122100
,
0x21122101
,
0x22102101
,
0x22112001
,
0x22112102
,
0x22112201
,
0x22122101
,
0x20102110
,
0x20102112
,
0x20102211
,
0x20112010
,
0x20112012
,
0x20112111
,
0x20112210
,
0x20112212
,
0x20122010
,
0x20122011
,
0x20122110
,
0x20122112
,
0x21102010
,
0x21102012
,
0x21102111
,
0x21102210
,
0x21102212
,
0x21112011
,
0x21112110
,
0x21112111
,
0x21112112
,
0x21112211
,
0x21122012
,
0x21122111
,
0x21122112
,
0x21122212
,
0x22102011
,
0x22102110
,
0x22112010
,
0x22112012
,
0x22112111
,
0x22112212
,
0x22122011
,
0x22122112
,
0x20102121
,
0x20112121
,
0x20122121
,
0x21102120
,
0x21102122
,
0x21102221
,
0x21112020
,
0x21112121
,
0x21112220
,
0x21122021
,
0x22102121
,
0x22112021
,
0x22112120
,
0x22112121
,
0x22112122
,
0x20200000
,
0x20200002
,
0x20200200
,
0x20200202
,
0x20210101
,
0x20220000
,
0x20220002
,
0x20220200
,
0x20220202
,
0x21200101
,
0x21210001
,
0x21210100
,
0x21210102
,
0x21210201
,
0x22200000
,
0x22200002
,
0x22200200
,
0x22200202
,
0x22210101
,
0x22220000
,
0x22220002
,
0x22220200
,
0x22220202
,
0x20200111
,
0x20200211
,
0x20210011
,
0x20210110
,
0x20210112
,
0x20210211
,
0x20210212
,
0x21200112
,
0x21200211
,
0x21210011
,
0x21210111
,
0x21210210
,
0x21210212
,
0x21220011
,
0x21220110
,
0x22200111
,
0x22210010
,
0x22210012
,
0x22210112
,
0x22210211
,
0x20200022
,
0x20200220
,
0x20200222
,
0x20210020
,
0x20210221
,
0x20220022
,
0x20220220
,
0x20220222
,
0x21200121
,
0x21210021
,
0x21210122
,
0x21210221
,
0x21220121
,
0x22200020
,
0x22200022
,
0x22200220
,
0x22200222
,
0x22210121
,
0x22220020
,
0x22220022
,
0x22220220
,
0x22220222
,
0x20211201
,
0x20221101
,
0x21201001
,
0x21201100
,
0x21211000
,
0x21211100
,
0x21211101
,
0x21211200
,
0x21211202
,
0x21221001
,
0x21221101
,
0x21221102
,
0x21221200
,
0x21221201
,
0x22201101
,
0x20201112
,
0x20201211
,
0x20211010
,
0x20211012
,
0x20211111
,
0x20211210
,
0x20221112
,
0x20221211
,
0x21201012
,
0x21201111
,
0x21211011
,
0x21211110
,
0x21211111
,
0x21211112
,
0x21211211
,
0x21221111
,
0x21221212
,
0x22201011
,
0x22201110
,
0x22201111
,
0x22201112
,
0x22201211
,
0x22211012
,
0x22211111
,
0x22211210
,
0x20201121
,
0x20211021
,
0x20211122
,
0x20211222
,
0x20221021
,
0x20221121
,
0x21201120
,
0x21201122
,
0x21201222
,
0x21211022
,
0x21211121
,
0x21211122
,
0x21211220
,
0x21221020
,
0x21221022
,
0x22201122
,
0x22211020
,
0x22211121
,
0x22211122
,
0x22211221
,
0x22221021
,
0x22221120
,
0x22221122
,
0x20202000
,
0x20202002
,
0x20202200
,
0x20202202
,
0x20222000
,
0x20222002
,
0x20222200
,
0x20222202
,
0x21212001
,
0x21212100
,
0x21212102
,
0x21212201
,
0x22202000
,
0x22202002
,
0x22202200
,
0x22202202
,
0x22212101
,
0x22222000
,
0x22222002
,
0x22222200
,
0x22222202
,
0x20202111
,
0x20212110
,
0x20212211
,
0x20222011
,
0x20222111
,
0x21202011
,
0x21212010
,
0x21212111
,
0x21212212
,
0x21222011
,
0x21222112
,
0x21222211
,
0x22212010
,
0x22212112
,
0x20202020
,
0x20202022
,
0x20202220
,
0x20202222
,
0x20222020
,
0x20222022
,
0x20222220
,
0x20222222
,
0x21212021
,
0x21212120
,
0x21212122
,
0x22202020
,
0x22202022
,
0x22202220
,
0x22202222
,
0x22212121
,
0x22222020
,
0x22222022
,
0x22222220
,
0x22222222
,
};
#endif
#ifndef HAVE_FANCY_SIMD
const
uint64_t
keven_signs
[
128
]
=
{
const
uint64_t
keven_signs
[
128
]
=
{
0x0101010101010101
,
0xff010101010101ff
,
0xff0101010101ff01
,
0x010101010101ffff
,
0x0101010101010101
,
0xff010101010101ff
,
0xff0101010101ff01
,
0x010101010101ffff
,
0xff01010101ff0101
,
0x0101010101ff01ff
,
0x0101010101ffff01
,
0xff01010101ffffff
,
0xff01010101ff0101
,
0x0101010101ff01ff
,
0x0101010101ffff01
,
0xff01010101ffffff
,
...
@@ -181,31 +989,41 @@ const uint64_t keven_signs[128] = {
...
@@ -181,31 +989,41 @@ const uint64_t keven_signs[128] = {
0x01ffffffff010101
,
0xffffffffff0101ff
,
0xffffffffff01ff01
,
0x01ffffffff01ffff
,
0x01ffffffff010101
,
0xffffffffff0101ff
,
0xffffffffff01ff01
,
0x01ffffffff01ffff
,
0xffffffffffff0101
,
0x01ffffffffff01ff
,
0x01ffffffffffff01
,
0xffffffffffffffff
,
0xffffffffffff0101
,
0x01ffffffffff01ff
,
0x01ffffffffffff01
,
0xffffffffffffffff
,
};
};
#endif
}
}
bool
iqk_mul_mat
(
long
Nx
,
long
Ny
,
long
ne00
,
int
typeA
,
const
void
*
A
,
const
void
*
B
,
/* moonll change mulmat
add typeB and strideB
}*/
bool
iqk_mul_mat
(
long
Nx
,
long
Ny
,
long
ne00
,
int
typeA
,
const
void
*
A
,
long
strideA
,
int
typeB
,
const
void
*
B
,
long
strideB
,
float
*
C
,
long
stride_C
,
int
ith
,
int
nth
)
{
float
*
C
,
long
stride_C
,
int
ith
,
int
nth
)
{
MulMat
mm
;
MulMat
mm
;
int
row_size_q8
;
if
(
!
MulMat
::
set_mul_mat
(
typeA
,
ne00
,
mm
,
row_size_q8
,
Ny
))
{
if
(
!
MulMat
::
set_mul_mat
(
typeA
,
typeB
,
ne00
,
mm
,
Ny
))
{
return
false
;
return
false
;
}
}
auto
row_size_qx
=
ggml_row_size
((
ggml_type
)
typeA
,
ne00
);
size_t
row_size_qx
=
strideA
*
ggml_type_size
(
ggml_type
(
typeA
));
size_t
row_size_qy
=
strideB
*
ggml_type_size
(
ggml_type
(
typeB
));
auto
nrc_x
=
(
Nx
+
nth
-
1
)
/
nth
;
auto
nrc_x
=
(
Nx
+
nth
-
1
)
/
nth
;
auto
first_x
=
ith
*
nrc_x
;
auto
first_x
=
ith
*
nrc_x
;
if
(
first_x
+
nrc_x
>
Nx
)
nrc_x
=
Nx
-
first_x
;
if
(
first_x
+
nrc_x
>
Nx
)
nrc_x
=
Nx
-
first_x
;
DataInfo
info
{
C
+
first_x
,
(
const
char
*
)
B
,
(
size_t
)
stride_C
,
(
size_t
)
row_size_q
8
,
0
,
1
,
nullptr
,
0
};
DataInfo
info
{
C
+
first_x
,
(
const
char
*
)
B
,
(
size_t
)
stride_C
,
row_size_q
y
,
0
,
1
,
nullptr
,
0
};
mm
.
mul_mat_NxM
(
ne00
,
(
const
char
*
)
A
+
row_size_qx
*
first_x
,
row_size_qx
,
info
,
nrc_x
,
Ny
);
mm
.
mul_mat_NxM
(
ne00
,
(
const
char
*
)
A
+
row_size_qx
*
first_x
,
row_size_qx
,
info
,
nrc_x
,
Ny
);
return
true
;
return
true
;
}
}
bool
iqk_mul_mat_moe
(
long
Nx
,
long
Ny
,
long
ne00
,
int
ne11
,
int
typeA
,
const
void
*
A
,
const
void
*
B
,
bool
iqk_mul_mat_moe
(
long
Nx
,
long
Ny
,
long
ne00
,
int
ne11
,
int
typeA
,
const
void
*
A
,
const
void
*
B
,
float
*
C
,
long
nb1
,
long
nb2
,
const
void
*
vrow_mapping
,
int
ith
,
int
nth
)
{
float
*
C
,
long
nb1
,
long
nb2
,
const
void
*
vrow_mapping
,
int
ith
,
int
nth
)
{
const
mmid_row_mapping
*
row_mapping
=
(
const
mmid_row_mapping
*
)
vrow_mapping
;
const
mmid_row_mapping
*
row_mapping
=
(
const
mmid_row_mapping
*
)
vrow_mapping
;
...
@@ -213,9 +1031,11 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
...
@@ -213,9 +1031,11 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
MulMat
mm
;
MulMat
mm
;
int
row_size_q8
;
int
row_size_q8
;
/* moonll
if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
return false;
return false;
}
}
*/
int
row_size_qx
=
ggml_row_size
((
ggml_type
)
typeA
,
ne00
);
int
row_size_qx
=
ggml_row_size
((
ggml_type
)
typeA
,
ne00
);
int
nrc_x
=
(
Nx
+
nth
-
1
)
/
nth
;
int
nrc_x
=
(
Nx
+
nth
-
1
)
/
nth
;
int
first_x
=
ith
*
nrc_x
;
int
first_x
=
ith
*
nrc_x
;
...
@@ -233,6 +1053,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
...
@@ -233,6 +1053,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#define HAVE_FANCY_SIMD
#endif
#endif
//#define HAVE_FANCY_SIMD
namespace
{
namespace
{
...
@@ -257,10 +1078,9 @@ template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
...
@@ -257,10 +1078,9 @@ template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
}
}
#ifdef HAVE_FANCY_SIMD
#ifdef HAVE_FANCY_SIMD
inline
__m512i
load_quants
(
int
iy
,
int
i
,
int
j
)
const
{
return
_mm512_loadu_si512
((
const
__m512i
*
)
y
[
iy
][
i
]
.
qs
+
j
);
}
inline
__m512i
load_quants64
(
int
iy
,
int
i
,
int
j
)
const
{
return
_mm512_loadu_si512
((
const
__m512i
*
)
y
[
iy
][
i
]
.
qs
+
j
);
}
#else
inline
__m256i
load_quants
(
int
iy
,
int
i
,
int
j
)
const
{
return
_mm256_loadu_si256
((
const
__m256i
*
)
y
[
iy
][
i
]
.
qs
+
j
);
}
#endif
#endif
inline
__m256i
load_quants
(
int
iy
,
int
i
,
int
j
)
const
{
return
_mm256_loadu_si256
((
const
__m256i
*
)
y
[
iy
][
i
]
.
qs
+
j
);
}
inline
__m256i
load_bsums
(
int
iy
,
int
i
)
const
{
return
_mm256_loadu_si256
((
const
__m256i
*
)
y
[
iy
][
i
]
.
bsums
);
}
inline
__m256i
load_bsums
(
int
iy
,
int
i
)
const
{
return
_mm256_loadu_si256
((
const
__m256i
*
)
y
[
iy
][
i
]
.
bsums
);
}
inline
float
scale
(
int
iy
,
int
i
)
const
{
return
y
[
iy
][
i
]
.
d
;
}
inline
float
scale
(
int
iy
,
int
i
)
const
{
return
y
[
iy
][
i
]
.
d
;
}
...
@@ -353,6 +1173,23 @@ struct ScaleIQ4XS {
...
@@ -353,6 +1173,23 @@ struct ScaleIQ4XS {
const
__m128i
m32
=
_mm_set1_epi16
(
-
32
);
const
__m128i
m32
=
_mm_set1_epi16
(
-
32
);
};
};
struct
Scales8KBase
{
template
<
typename
Q8
>
inline
void
accum_mins
(
const
__m128i
&
mins128
,
const
Q8
&
q8
,
int
i
,
float
c
,
__m256
*
accd
)
const
{
const
__m256i
mins
=
MM256_SET_M128I
(
_mm_shuffle_epi8
(
mins128
,
shuffles
[
1
]),
_mm_shuffle_epi8
(
mins128
,
shuffles
[
0
]));
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
const
__m256i
q8s
=
q8
.
load_bsums
(
iy
,
i
);
const
__m256i
prod
=
_mm256_madd_epi16
(
mins
,
q8s
);
accd
[
iy
]
=
_mm256_fmadd_ps
(
_mm256_set1_ps
(
c
*
q8
.
scale
(
iy
,
i
)),
_mm256_cvtepi32_ps
(
prod
),
accd
[
iy
]);
}
}
inline
__m256i
shuffle
(
__m128i
mins
)
const
{
return
MM256_SET_M128I
(
_mm_shuffle_epi8
(
mins
,
shuffles
[
1
]),
_mm_shuffle_epi8
(
mins
,
shuffles
[
0
]));
}
const
__m128i
shuffles
[
2
]
=
{
_mm_set_epi32
(
0x07060706
,
0x05040504
,
0x03020302
,
0x01000100
),
_mm_set_epi32
(
0x0f0e0f0e
,
0x0d0c0d0c
,
0x0b0a0b0a
,
0x09080908
)};
};
template
<
typename
Block
>
template
<
typename
Block
>
struct
BaseDequantizer
{
struct
BaseDequantizer
{
BaseDequantizer
(
const
void
*
vx
,
size_t
bx
)
:
vx
(
vx
),
bx
(
bx
)
{}
BaseDequantizer
(
const
void
*
vx
,
size_t
bx
)
:
vx
(
vx
),
bx
(
bx
)
{}
...
@@ -367,6 +1204,16 @@ struct BaseDequantizer {
...
@@ -367,6 +1204,16 @@ struct BaseDequantizer {
float
d
;
float
d
;
};
};
__m128i
inline
load_iq4nl_values_128
()
{
static
const
uint8_t
kvalues_iq4nl
[
16
]
=
{
1
,
24
,
45
,
63
,
79
,
93
,
106
,
118
,
129
,
141
,
153
,
166
,
181
,
197
,
217
,
241
};
return
_mm_loadu_si128
((
const
__m128i
*
)
kvalues_iq4nl
);
}
__m256i
inline
load_iq4nl_values_256
()
{
auto
val128
=
load_iq4nl_values_128
();
return
MM256_SET_M128I
(
val128
,
val128
);
}
#ifdef HAVE_FANCY_SIMD
#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================
//====================================== Zen4 ==================================================
...
@@ -434,8 +1281,17 @@ struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
...
@@ -434,8 +1281,17 @@ struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
Scales8K
s8k
;
Scales8K
s8k
;
};
};
/*
moonll DequantizerIQ4XS
*/
__m512i
inline
load_iq4nl_values_512
()
{
auto
val256
=
load_iq4nl_values_256
();
return
_mm512_inserti32x8
(
_mm512_castsi256_si512
(
val256
),
val256
,
1
);
}
struct
DequantizerIQ4XS
final
:
public
BaseDequantizer
<
block_iq4_xs
>
{
struct
DequantizerIQ4XS
final
:
public
BaseDequantizer
<
block_iq4_xs
>
{
DequantizerIQ4XS
(
const
void
*
vx
,
size_t
bx
)
:
BaseDequantizer
(
vx
,
bx
),
values
(
load_values
())
{}
DequantizerIQ4XS
(
const
void
*
vx
,
size_t
bx
)
:
BaseDequantizer
(
vx
,
bx
),
values
(
load_
iq4nl_
values
_512
())
{}
template
<
typename
Q8
>
template
<
typename
Q8
>
inline
void
new_block
(
int
i
,
const
Q8
&
q8
,
__m256
*
accd
,
__m512i
*
scales
)
{
inline
void
new_block
(
int
i
,
const
Q8
&
q8
,
__m256
*
accd
,
__m512i
*
scales
)
{
d
=
GGML_FP16_TO_FP32
(
x
[
i
]
.
d
);
d
=
GGML_FP16_TO_FP32
(
x
[
i
]
.
d
);
...
@@ -444,14 +1300,10 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
...
@@ -444,14 +1300,10 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
s8k
.
accum_mins
(
scales128
,
q8
,
i
,
-
128.
f
*
d
,
accd
);
s8k
.
accum_mins
(
scales128
,
q8
,
i
,
-
128.
f
*
d
,
accd
);
auto
scales256
=
MM256_SET_M128I
(
scales128
,
scales128
);
auto
scales256
=
MM256_SET_M128I
(
scales128
,
scales128
);
auto
all_scales
=
_mm512_inserti32x8
(
_mm512_castsi256_si512
(
scales256
),
scales256
,
1
);
auto
all_scales
=
_mm512_inserti32x8
(
_mm512_castsi256_si512
(
scales256
),
scales256
,
1
);
scales
[
0
]
=
_mm512_shuffle_epi8
(
all_scales
,
s8k
.
shuffles512
[
0
]);
scales
[
0
]
=
_mm512_shuffle_epi8
(
all_scales
,
shuffles
[
0
]);
scales
[
1
]
=
_mm512_shuffle_epi8
(
all_scales
,
s8k
.
shuffles512
[
1
]);
scales
[
1
]
=
_mm512_shuffle_epi8
(
all_scales
,
shuffles
[
1
]);
}
scales
[
2
]
=
_mm512_shuffle_epi8
(
all_scales
,
shuffles
[
2
]);
static
__m512i
load_values
()
{
scales
[
3
]
=
_mm512_shuffle_epi8
(
all_scales
,
shuffles
[
3
]);
static
const
uint8_t
kvalues_iq4nl
[
16
]
=
{
1
,
24
,
45
,
63
,
79
,
93
,
106
,
118
,
129
,
141
,
153
,
166
,
181
,
197
,
217
,
241
};
auto
val128
=
_mm_loadu_si128
((
const
__m128i
*
)
kvalues_iq4nl
);
auto
val256
=
MM256_SET_M128I
(
val128
,
val128
);
return
_mm512_inserti32x8
(
_mm512_castsi256_si512
(
val256
),
val256
,
1
);
}
}
inline
void
prepare
(
const
uint8_t
*
q4
)
{
inline
void
prepare
(
const
uint8_t
*
q4
)
{
bits
.
prepare64
(
q4
);
bits
.
prepare64
(
q4
);
...
@@ -467,11 +1319,17 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
...
@@ -467,11 +1319,17 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
}
}
Q4Bits
bits
;
Q4Bits
bits
;
Scales8K
s8k
;
Scales8K
Base
s8k
;
ScaleIQ4XS
siq4
;
ScaleIQ4XS
siq4
;
const
__m512i
values
;
const
__m512i
values
;
const
__m512i
permute1
=
_mm512_set_epi64
(
11
,
10
,
3
,
2
,
9
,
8
,
1
,
0
);
const
__m512i
permute1
=
_mm512_set_epi64
(
11
,
10
,
3
,
2
,
9
,
8
,
1
,
0
);
const
__m512i
permute2
=
_mm512_set_epi64
(
15
,
14
,
7
,
6
,
13
,
12
,
5
,
4
);
const
__m512i
permute2
=
_mm512_set_epi64
(
15
,
14
,
7
,
6
,
13
,
12
,
5
,
4
);
const
__m512i
shuffles
[
4
]
=
{
_mm512_inserti32x8
(
_mm512_set1_epi16
(
0x0100
),
_mm256_set1_epi16
(
0x0302
),
1
),
_mm512_inserti32x8
(
_mm512_set1_epi16
(
0x0504
),
_mm256_set1_epi16
(
0x0706
),
1
),
_mm512_inserti32x8
(
_mm512_set1_epi16
(
0x0908
),
_mm256_set1_epi16
(
0x0b0a
),
1
),
_mm512_inserti32x8
(
_mm512_set1_epi16
(
0x0d0c
),
_mm256_set1_epi16
(
0x0f0e
),
1
),
};
};
};
struct
HighBit5
{
struct
HighBit5
{
...
@@ -646,6 +1504,149 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
...
@@ -646,6 +1504,149 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
}
}
}
}
template
<
typename
Q8
>
inline
void
compute_block
(
int
iy
,
int
i
,
float
d
,
const
Q8
&
q8
,
const
__m512i
*
values
,
const
__m512i
*
scales
,
__m512
*
accd
)
{
const
__m512i
p1
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
values
[
0
],
q8
.
load_quants64
(
iy
,
i
,
0
));
const
__m512i
p2
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
values
[
1
],
q8
.
load_quants64
(
iy
,
i
,
1
));
const
__m512i
p3
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
values
[
2
],
q8
.
load_quants64
(
iy
,
i
,
2
));
const
__m512i
p4
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
values
[
3
],
q8
.
load_quants64
(
iy
,
i
,
3
));
auto
sumi
=
_mm512_dpwssd_epi32
(
_mm512_setzero_si512
(),
scales
[
0
],
_mm512_packs_epi32
(
p1
,
p2
));
sumi
=
_mm512_dpwssd_epi32
(
sumi
,
scales
[
1
],
_mm512_packs_epi32
(
p3
,
p4
));
accd
[
iy
]
=
_mm512_fmadd_ps
(
_mm512_set1_ps
(
d
*
q8
.
scale
(
iy
,
i
)),
_mm512_cvtepi32_ps
(
sumi
),
accd
[
iy
]);
}
template
<
typename
Dequantizer
,
int
nrc_y
>
static
void
mul_mat_qX_K_q8_K_AVX512
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
assert
(
n
%
QK_K
==
0
);
const
int
nb
=
n
/
QK_K
;
Q8
<
nrc_y
>
q8
(
info
);
Dequantizer
deq
(
vx
,
bx
);
__m256
accm
[
nrc_y
];
__m512
accd
[
nrc_y
];
__m512i
scales
[
2
];
for
(
int
ix
=
0
;
ix
<
nrc_x
;
++
ix
)
{
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
accd
[
iy
]
=
_mm512_setzero_ps
();
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
accm
[
iy
]
=
_mm256_setzero_ps
();
deq
.
new_row
(
ix
);
for
(
int
i
=
0
;
i
<
nb
;
++
i
)
{
deq
.
new_block
(
i
,
q8
,
accm
,
scales
);
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
const
__m512i
p1
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
deq
.
bits
.
values
[
0
],
q8
.
load_quants64
(
iy
,
i
,
0
));
const
__m512i
p2
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
deq
.
bits
.
values
[
1
],
q8
.
load_quants64
(
iy
,
i
,
1
));
const
__m512i
p3
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
deq
.
bits
.
values
[
2
],
q8
.
load_quants64
(
iy
,
i
,
2
));
const
__m512i
p4
=
_mm512_dpbusd_epi32
(
_mm512_setzero_si512
(),
deq
.
bits
.
values
[
3
],
q8
.
load_quants64
(
iy
,
i
,
3
));
auto
sumi
=
_mm512_dpwssd_epi32
(
_mm512_setzero_si512
(),
scales
[
0
],
_mm512_packs_epi32
(
p1
,
p2
));
sumi
=
_mm512_dpwssd_epi32
(
sumi
,
scales
[
1
],
_mm512_packs_epi32
(
p3
,
p4
));
accd
[
iy
]
=
_mm512_fmadd_ps
(
_mm512_set1_ps
(
deq
.
d
*
q8
.
scale
(
iy
,
i
)),
_mm512_cvtepi32_ps
(
sumi
),
accd
[
iy
]);
}
}
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
auto
sum256
=
_mm256_add_ps
(
_mm512_castps512_ps256
(
accd
[
iy
]),
_mm512_extractf32x8_ps
(
accd
[
iy
],
1
));
info
.
store
(
ix
,
iy
,
hsum_float_8
(
_mm256_add_ps
(
accm
[
iy
],
sum256
)));
}
}
}
template
<
typename
Dequantizer
,
int
nrc_y
>
static
void
mul_mat_iqX_k_q8_K_AVX512
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
assert
(
n
%
QK_K
==
0
);
const
int
nb
=
n
/
QK_K
;
Q8
<
nrc_y
>
q8
(
info
);
Dequantizer
deq
(
vx
,
bx
);
__m256
accm
[
nrc_y
];
__m512
accd
[
nrc_y
];
__m512i
scales
[
4
];
for
(
int
ix
=
0
;
ix
<
nrc_x
;
++
ix
)
{
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
accd
[
iy
]
=
_mm512_setzero_ps
();
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
accm
[
iy
]
=
_mm256_setzero_ps
();
deq
.
new_row
(
ix
);
for
(
int
i
=
0
;
i
<
nb
;
++
i
)
{
deq
.
new_block
(
i
,
q8
,
accm
,
scales
);
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
const
__m512i
p1
=
_mm512_maddubs_epi16
(
deq
.
bits
.
values
[
0
],
q8
.
load_quants64
(
iy
,
i
,
0
));
const
__m512i
p2
=
_mm512_maddubs_epi16
(
deq
.
bits
.
values
[
1
],
q8
.
load_quants64
(
iy
,
i
,
1
));
const
__m512i
p3
=
_mm512_maddubs_epi16
(
deq
.
bits
.
values
[
2
],
q8
.
load_quants64
(
iy
,
i
,
2
));
const
__m512i
p4
=
_mm512_maddubs_epi16
(
deq
.
bits
.
values
[
3
],
q8
.
load_quants64
(
iy
,
i
,
3
));
auto
sumi
=
_mm512_dpwssd_epi32
(
_mm512_dpwssd_epi32
(
_mm512_dpwssd_epi32
(
_mm512_dpwssd_epi32
(
_mm512_setzero_si512
(),
p1
,
scales
[
0
]),
p2
,
scales
[
1
]),
p3
,
scales
[
2
]),
p4
,
scales
[
3
]);
accd
[
iy
]
=
_mm512_fmadd_ps
(
_mm512_set1_ps
(
deq
.
d
*
q8
.
scale
(
iy
,
i
)),
_mm512_cvtepi32_ps
(
sumi
),
accd
[
iy
]);
}
}
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
auto
sum256
=
_mm256_add_ps
(
_mm512_castps512_ps256
(
accd
[
iy
]),
_mm512_extractf32x8_ps
(
accd
[
iy
],
1
));
info
.
store
(
ix
,
iy
,
hsum_float_8
(
_mm256_add_ps
(
accm
[
iy
],
sum256
)));
}
}
}
template
<
typename
Dequantizer
>
static
void
mul_mat_qX_K_q8_K_AVX512_1
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
assert
(
n
%
QK_K
==
0
);
const
int
nb
=
n
/
QK_K
;
constexpr
int
k_nx
=
2
;
Q8
<
1
>
q8
(
info
);
Dequantizer
deq1
(
vx
,
bx
);
Dequantizer
deq2
(
vx
,
bx
);
Dequantizer
*
deq
[
k_nx
];
deq
[
0
]
=
&
deq1
;
deq
[
1
]
=
&
deq2
;
__m512i
scales
[
2
*
k_nx
];
for
(
int
ix
=
0
;
ix
<
nrc_x
;
++
ix
)
{
auto
accd
=
_mm512_setzero_ps
();
auto
accm
=
_mm256_setzero_ps
();
for
(
int
kx
=
0
;
kx
<
k_nx
;
++
kx
)
deq
[
kx
]
->
new_row
(
ix
);
for
(
int
i
=
0
;
i
<
nb
/
k_nx
;
++
i
)
{
for
(
int
kx
=
0
;
kx
<
k_nx
;
++
kx
)
deq
[
kx
]
->
new_block
(
k_nx
*
i
+
kx
,
q8
,
&
accm
,
scales
+
2
*
kx
);
for
(
int
kx
=
0
;
kx
<
k_nx
;
++
kx
)
{
compute_block
(
0
,
k_nx
*
i
+
kx
,
deq
[
kx
]
->
d
,
q8
,
deq
[
kx
]
->
bits
.
values
,
scales
+
2
*
kx
,
&
accd
);
}
}
if
(
2
*
(
nb
/
2
)
<
nb
)
{
int
i0
=
2
*
(
nb
/
2
);
deq
[
0
]
->
new_block
(
i0
,
q8
,
&
accm
,
scales
);
compute_block
(
0
,
i0
,
deq
[
0
]
->
d
,
q8
,
deq
[
0
]
->
bits
.
values
,
scales
,
&
accd
);
}
auto
sum256
=
_mm256_add_ps
(
_mm512_castps512_ps256
(
accd
),
_mm512_extractf32x8_ps
(
accd
,
1
));
info
.
store
(
ix
,
0
,
hsum_float_8
(
_mm256_add_ps
(
accm
,
sum256
)));
}
}
#else
#else
// ===================================== Vanilla AVX2 =====================================
// ===================================== Vanilla AVX2 =====================================
...
@@ -724,17 +1725,8 @@ struct HighBit3 {
...
@@ -724,17 +1725,8 @@ struct HighBit3 {
__m256i
hbits
;
__m256i
hbits
;
};
};
inline
__m256i
get_scale_shuffle_8
(
int
i
)
{
return
_mm256_set1_epi16
((
2
*
i
)
|
((
2
*
i
+
1
)
<<
8
));
}
inline
void
set_scales_8
(
const
__m256i
&
all_scales
,
int
j
,
__m256i
*
scales
)
{
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
0
));
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
1
));
scales
[
2
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
2
));
scales
[
3
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
3
));
}
/*
template <typename Q8, typename Bits>
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
if (j == 0) {
if (j == 0) {
...
@@ -755,7 +1747,7 @@ inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i,
...
@@ -755,7 +1747,7 @@ inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i,
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
}
}
}
}
}
}
*/
struct
DequantizerQ4K
final
:
public
BaseDequantizer
<
block_q4_K
>
{
struct
DequantizerQ4K
final
:
public
BaseDequantizer
<
block_q4_K
>
{
DequantizerQ4K
(
const
void
*
vx
,
size_t
bx
)
:
BaseDequantizer
(
vx
,
bx
)
{}
DequantizerQ4K
(
const
void
*
vx
,
size_t
bx
)
:
BaseDequantizer
(
vx
,
bx
)
{}
...
@@ -889,22 +1881,8 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
...
@@ -889,22 +1881,8 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
const
__m256i
mh
=
_mm256_set1_epi8
(
0x30
);
const
__m256i
mh
=
_mm256_set1_epi8
(
0x30
);
};
};
inline
__m256i
get_scale_shuffle_16
(
int
i
)
{
static
const
uint8_t
k_shuffle
[
128
]
=
{
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
};
return
_mm256_loadu_si256
((
const
__m256i
*
)
k_shuffle
+
i
);
}
inline
void
set_scales_16
(
const
__m256i
&
all_scales
,
__m256i
*
scales
)
{
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
0
));
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
1
));
scales
[
2
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
2
));
scales
[
3
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
3
));
}
template
<
typename
Dequantizer
,
int
nrc_y
>
template
<
typename
Dequantizer
,
int
nrc_y
>
static
void
mul_mat_qY_K_q8_K_T
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
static
void
mul_mat_qY_K_q8_K_T
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
...
@@ -1000,6 +1978,8 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
...
@@ -1000,6 +1978,8 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
}
}
#endif // Zen4 or vanilla AVX2
#endif // Zen4 or vanilla AVX2
//
//
// ============================== Legacy quants
// ============================== Legacy quants
//
//
...
@@ -1075,6 +2055,28 @@ struct ScaleHelperQ_0 {
...
@@ -1075,6 +2055,28 @@ struct ScaleHelperQ_0 {
template
<
typename
Q
>
inline
float
prepare1
(
const
Q
*
y
)
const
{
return
GGML_FP16_TO_FP32
(
y
->
d
);
}
template
<
typename
Q
>
inline
float
prepare1
(
const
Q
*
y
)
const
{
return
GGML_FP16_TO_FP32
(
y
->
d
);
}
template
<
typename
Q
>
inline
float
prepare1
(
float
d
,
const
Q
*
y
)
const
{
return
d
*
prepare1
(
y
);
}
template
<
typename
Q
>
inline
float
prepare1
(
float
d
,
const
Q
*
y
)
const
{
return
d
*
prepare1
(
y
);
}
};
};
template
<
int
min_value
>
struct
ScaleHelperQ_0_1
{
ggml_half
scales8
[
4
];
template
<
typename
Q
>
inline
__m256
prepare4
(
const
Q
*
y
)
{
for
(
int
j
=
0
;
j
<
4
;
++
j
)
scales8
[
j
]
=
y
[
j
]
.
d
;
auto
s4
=
_mm_cvtph_ps
(
_mm_loadl_epi64
((
const
__m128i
*
)
scales8
));
return
_mm256_set_m128
(
_mm_mul_ps
(
s4
,
min
),
s4
);
}
template
<
typename
Q
>
inline
__m256
prepare4
(
__m256
other_scales
,
const
Q
*
y
)
{
return
_mm_mul256_ps
(
other_scales
,
prepare4
<
Q
>
(
y
));
}
template
<
typename
Q
>
inline
std
::
pair
<
float
,
float
>
prepare1
(
const
Q
*
y
)
const
{
float
d
=
GGML_FP16_TO_FP32
(
y
->
d
);
return
std
::
make_pair
(
d
,
-
d
*
float
(
min_value
));
}
std
::
pair
<
float
,
float
>
inline
prepare1
(
const
std
::
pair
<
float
,
float
>&
dm
,
const
block_q8_1
*
y
)
const
{
return
std
::
make_pair
(
dm
.
first
*
GGML_FP16_TO_FP32
(
y
->
d
),
dm
.
second
*
GGML_FP16_TO_FP32
(
y
->
s
));
}
const
__m128
min
=
_mm_set1_ps
(
float
(
-
min_value
));
};
struct
ScaleHelperQ_1
{
struct
ScaleHelperQ_1
{
uint32_t
scales8
[
4
];
uint32_t
scales8
[
4
];
...
@@ -1235,6 +2237,12 @@ struct Q8_0_Dequantizer {
...
@@ -1235,6 +2237,12 @@ struct Q8_0_Dequantizer {
}
}
};
};
struct
Q8_0_1_Dequantizer
{
inline
__m256i
dequant
(
const
block_q8_0
*
x
)
const
{
return
_mm256_add_epi8
(
_mm256_set1_epi8
(
127
),
_mm256_loadu_si256
((
const
__m256i
*
)
x
->
qs
));
}
};
struct
Q4_0_Dequantizer
{
struct
Q4_0_Dequantizer
{
Dequantizer4bit
b4
;
Dequantizer4bit
b4
;
const
__m256i
m8
=
_mm256_set1_epi8
(
-
8
);
const
__m256i
m8
=
_mm256_set1_epi8
(
-
8
);
...
@@ -1320,6 +2328,11 @@ struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_
...
@@ -1320,6 +2328,11 @@ struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_
Q8_0_Unpacker
(
const
void
*
vx
,
size_t
bx
)
:
Q_Unpacker
(
vx
,
bx
)
{}
Q8_0_Unpacker
(
const
void
*
vx
,
size_t
bx
)
:
Q_Unpacker
(
vx
,
bx
)
{}
inline
static
int
block_size
()
{
return
QK4_0
;
}
inline
static
int
block_size
()
{
return
QK4_0
;
}
};
};
struct
Q8_0_1_Unpacker
final
:
public
Q_Unpacker
<
block_q8_0
,
ScaleHelperQ_0_1
<
127
>
,
Q8_0_1_Dequantizer
>
{
Q8_0_1_Unpacker
(
const
void
*
vx
,
size_t
bx
)
:
Q_Unpacker
(
vx
,
bx
)
{}
// using Sum4T = Sum4TypeQ81;
inline
static
int
block_size
()
{
return
QK8_0
;
}
};
struct
Q4_0_Unpacker
final
:
public
Q_Unpacker
<
block_q4_0
,
ScaleHelperQ_0
,
Q4_0_Dequantizer
>
{
struct
Q4_0_Unpacker
final
:
public
Q_Unpacker
<
block_q4_0
,
ScaleHelperQ_0
,
Q4_0_Dequantizer
>
{
Q4_0_Unpacker
(
const
void
*
vx
,
size_t
bx
)
:
Q_Unpacker
(
vx
,
bx
)
{}
Q4_0_Unpacker
(
const
void
*
vx
,
size_t
bx
)
:
Q_Unpacker
(
vx
,
bx
)
{}
inline
static
int
block_size
()
{
return
QK4_0
;
}
inline
static
int
block_size
()
{
return
QK4_0
;
}
...
@@ -1353,8 +2366,466 @@ void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
...
@@ -1353,8 +2366,466 @@ void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
}
}
}
}
/*
moonll
add some structs for DequantizerIQ2XXS
SimpleBits
EvenSignHelper
*/
struct
SimpleBits
{
__m256i
values
[
4
];
};
struct
EvenSignHelper
{
#ifdef HAVE_FANCY_SIMD
union
sbits_t
{
__m128i
vec
;
__mmask32
mask
[
4
];
};
IQK_ALWAYS_INLINE
void
sign_2_values
(
__m256i
aux
,
__m256i
*
values
)
const
{
aux
=
_mm256_and_si256
(
_mm256_srlv_epi32
(
aux
,
shifts
),
mask
);
auto
pcnt
=
_mm256_popcnt_epi32
(
aux
);
sbits_t
sbits
;
sbits
.
vec
=
_mm256_cvtepi32_epi8
(
_mm256_or_si256
(
aux
,
_mm256_slli_epi32
(
_mm256_and_si256
(
pcnt
,
mone
),
7
)));
values
[
0
]
=
_mm256_mask_sub_epi8
(
values
[
0
],
sbits
.
mask
[
0
],
_mm256_setzero_si256
(),
values
[
0
]);
values
[
1
]
=
_mm256_mask_sub_epi8
(
values
[
1
],
sbits
.
mask
[
1
],
_mm256_setzero_si256
(),
values
[
1
]);
//auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
//const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
//values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
//values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
}
const
__m256i
shifts
=
_mm256_set_epi32
(
21
,
14
,
7
,
0
,
21
,
14
,
7
,
0
);
const
__m256i
mask
=
_mm256_set1_epi32
(
127
);
const
__m256i
mone
=
_mm256_set1_epi32
(
1
);
#else
inline
void
sign_value
(
uint32_t
aux32
,
__m256i
&
value
)
const
{
auto
signs
=
_mm256_set_epi64x
(
keven_signs
[(
aux32
>>
21
)
&
127
],
keven_signs
[(
aux32
>>
14
)
&
127
],
keven_signs
[(
aux32
>>
7
)
&
127
],
keven_signs
[(
aux32
>>
0
)
&
127
]);
value
=
_mm256_sign_epi8
(
value
,
signs
);
}
#endif
};
/*
moonll ad multiply_add for mul_mat_qX_K_q8_K_IQ_1
add func
get_scale_shuffle_8
get_scale_shuffle_16
set_scales_16
*/
inline
__m256i
get_scale_shuffle_8
(
int
i
)
{
return
_mm256_set1_epi16
((
2
*
i
)
|
((
2
*
i
+
1
)
<<
8
));
}
inline
void
set_scales_8
(
const
__m256i
&
all_scales
,
int
j
,
__m256i
*
scales
)
{
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
0
));
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
1
));
scales
[
2
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
2
));
scales
[
3
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_8
(
4
*
j
+
3
));
}
inline
__m256i
get_scale_shuffle_16
(
int
i
)
{
static
const
uint8_t
k_shuffle
[
128
]
=
{
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
2
,
3
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
4
,
5
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
8
,
9
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
10
,
11
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
12
,
13
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
14
,
15
,
};
return
_mm256_loadu_si256
((
const
__m256i
*
)
k_shuffle
+
i
);
}
inline
void
set_scales_16
(
const
__m256i
&
all_scales
,
__m256i
*
scales
)
{
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
0
));
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
1
));
scales
[
2
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
2
));
scales
[
3
]
=
_mm256_shuffle_epi8
(
all_scales
,
get_scale_shuffle_16
(
3
));
}
template
<
typename
Q8
,
typename
Bits
>
inline
void
multiply_add
(
const
Bits
&
bits
,
const
__m256i
*
scales
,
int
j
,
int
i
,
const
Q8
&
q8
,
__m256i
*
sumi
)
{
if
(
j
==
0
)
{
#ifdef HAVE_FANCY_SIMD
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
_mm256_setzero_si256
(),
scales
[
0
],
_mm256_maddubs_epi16
(
bits
.
values
[
0
],
q8
.
load_quants
(
iy
,
i
,
0
)));
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
1
],
_mm256_maddubs_epi16
(
bits
.
values
[
1
],
q8
.
load_quants
(
iy
,
i
,
1
)));
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
2
],
_mm256_maddubs_epi16
(
bits
.
values
[
2
],
q8
.
load_quants
(
iy
,
i
,
2
)));
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
3
],
_mm256_maddubs_epi16
(
bits
.
values
[
3
],
q8
.
load_quants
(
iy
,
i
,
3
)));
}
#else
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
const
__m256i
p1
=
_mm256_madd_epi16
(
scales
[
0
],
_mm256_maddubs_epi16
(
bits
.
values
[
0
],
q8
.
load_quants
(
iy
,
i
,
0
)));
const
__m256i
p2
=
_mm256_madd_epi16
(
scales
[
1
],
_mm256_maddubs_epi16
(
bits
.
values
[
1
],
q8
.
load_quants
(
iy
,
i
,
1
)));
const
__m256i
p3
=
_mm256_madd_epi16
(
scales
[
2
],
_mm256_maddubs_epi16
(
bits
.
values
[
2
],
q8
.
load_quants
(
iy
,
i
,
2
)));
const
__m256i
p4
=
_mm256_madd_epi16
(
scales
[
3
],
_mm256_maddubs_epi16
(
bits
.
values
[
3
],
q8
.
load_quants
(
iy
,
i
,
3
)));
sumi
[
iy
]
=
_mm256_add_epi32
(
_mm256_add_epi32
(
p1
,
p3
),
_mm256_add_epi32
(
p2
,
p4
));
}
#endif
}
else
{
#ifdef HAVE_FANCY_SIMD
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
0
],
_mm256_maddubs_epi16
(
bits
.
values
[
0
],
q8
.
load_quants
(
iy
,
i
,
4
)));
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
1
],
_mm256_maddubs_epi16
(
bits
.
values
[
1
],
q8
.
load_quants
(
iy
,
i
,
5
)));
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
2
],
_mm256_maddubs_epi16
(
bits
.
values
[
2
],
q8
.
load_quants
(
iy
,
i
,
6
)));
sumi
[
iy
]
=
_mm256_dpwssd_epi32
(
sumi
[
iy
],
scales
[
3
],
_mm256_maddubs_epi16
(
bits
.
values
[
3
],
q8
.
load_quants
(
iy
,
i
,
7
)));
}
#else
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
const
__m256i
p1
=
_mm256_madd_epi16
(
scales
[
0
],
_mm256_maddubs_epi16
(
bits
.
values
[
0
],
q8
.
load_quants
(
iy
,
i
,
4
)));
const
__m256i
p2
=
_mm256_madd_epi16
(
scales
[
1
],
_mm256_maddubs_epi16
(
bits
.
values
[
1
],
q8
.
load_quants
(
iy
,
i
,
5
)));
const
__m256i
p3
=
_mm256_madd_epi16
(
scales
[
2
],
_mm256_maddubs_epi16
(
bits
.
values
[
2
],
q8
.
load_quants
(
iy
,
i
,
6
)));
const
__m256i
p4
=
_mm256_madd_epi16
(
scales
[
3
],
_mm256_maddubs_epi16
(
bits
.
values
[
3
],
q8
.
load_quants
(
iy
,
i
,
7
)));
sumi
[
iy
]
=
_mm256_add_epi32
(
sumi
[
iy
],
_mm256_add_epi32
(
p1
,
p3
));
sumi
[
iy
]
=
_mm256_add_epi32
(
sumi
[
iy
],
_mm256_add_epi32
(
p2
,
p4
));
}
#endif
}
}
/*
moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1
add func
set_scales_8_iq
set_scales_16_iq
add MUL_MAT
mul_mat_qX_K_q8_K_IQ_1
mul_mat_qX_K_q8_K_IQ_N
mul_mat_qX_K_q8_K_IQ
*/
template
<
typename
Bits
>
inline
void
multiply_add_1
(
int
j
,
const
Bits
&
bits
,
const
__m256i
*
scales
,
const
__m256i
*
q8
,
__m256i
*
sumi
)
{
if
(
j
==
0
)
{
#ifdef HAVE_FANCY_SIMD
auto
p1
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
0
],
q8
[
0
]);
auto
p2
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
1
],
q8
[
1
]);
auto
p3
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
2
],
q8
[
2
]);
auto
p4
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
3
],
q8
[
3
]);
sumi
[
0
]
=
_mm256_dpwssd_epi32
(
_mm256_setzero_si256
(),
scales
[
0
],
_mm256_packs_epi32
(
p1
,
p2
));
sumi
[
1
]
=
_mm256_dpwssd_epi32
(
_mm256_setzero_si256
(),
scales
[
1
],
_mm256_packs_epi32
(
p3
,
p4
));
#else
const
__m256i
p1
=
_mm256_madd_epi16
(
scales
[
0
],
_mm256_maddubs_epi16
(
bits
.
values
[
0
],
q8
[
0
]));
const
__m256i
p2
=
_mm256_madd_epi16
(
scales
[
1
],
_mm256_maddubs_epi16
(
bits
.
values
[
1
],
q8
[
1
]));
const
__m256i
p3
=
_mm256_madd_epi16
(
scales
[
2
],
_mm256_maddubs_epi16
(
bits
.
values
[
2
],
q8
[
2
]));
const
__m256i
p4
=
_mm256_madd_epi16
(
scales
[
3
],
_mm256_maddubs_epi16
(
bits
.
values
[
3
],
q8
[
3
]));
sumi
[
0
]
=
_mm256_add_epi32
(
p1
,
p3
);
sumi
[
1
]
=
_mm256_add_epi32
(
p2
,
p4
);
#endif
}
else
{
#ifdef HAVE_FANCY_SIMD
auto
p1
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
0
],
q8
[
0
]);
auto
p2
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
1
],
q8
[
1
]);
auto
p3
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
2
],
q8
[
2
]);
auto
p4
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
bits
.
values
[
3
],
q8
[
3
]);
sumi
[
0
]
=
_mm256_dpwssd_epi32
(
sumi
[
0
],
scales
[
0
],
_mm256_packs_epi32
(
p1
,
p2
));
sumi
[
1
]
=
_mm256_dpwssd_epi32
(
sumi
[
1
],
scales
[
1
],
_mm256_packs_epi32
(
p3
,
p4
));
#else
const
__m256i
p1
=
_mm256_madd_epi16
(
scales
[
0
],
_mm256_maddubs_epi16
(
bits
.
values
[
0
],
q8
[
0
]));
const
__m256i
p2
=
_mm256_madd_epi16
(
scales
[
1
],
_mm256_maddubs_epi16
(
bits
.
values
[
1
],
q8
[
1
]));
const
__m256i
p3
=
_mm256_madd_epi16
(
scales
[
2
],
_mm256_maddubs_epi16
(
bits
.
values
[
2
],
q8
[
2
]));
const
__m256i
p4
=
_mm256_madd_epi16
(
scales
[
3
],
_mm256_maddubs_epi16
(
bits
.
values
[
3
],
q8
[
3
]));
sumi
[
0
]
=
_mm256_add_epi32
(
sumi
[
0
],
_mm256_add_epi32
(
p1
,
p3
));
sumi
[
1
]
=
_mm256_add_epi32
(
sumi
[
1
],
_mm256_add_epi32
(
p2
,
p4
));
#endif
}
}
inline
void
set_scales_8_iq
(
int
j
,
const
__m256i
&
all_scales
,
__m256i
*
scales
)
{
//#ifdef HAVE_FANCY_SIMD
auto
shuffle
=
j
==
0
?
_mm256_set_epi64x
(
0x0302030203020302
,
0x0100010001000100
,
0x0302030203020302
,
0x0100010001000100
)
:
_mm256_set_epi64x
(
0x0b0a0b0a0b0a0b0a
,
0x0908090809080908
,
0x0b0a0b0a0b0a0b0a
,
0x0908090809080908
);
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
shuffle
);
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
_mm256_add_epi8
(
shuffle
,
_mm256_set1_epi8
(
4
)));
//#else
// set_scales_8(all_scales, j, scales);
//#endif
}
inline
void
set_scales_16_iq
(
const
__m256i
&
all_scales
,
__m256i
*
scales
)
{
#ifdef HAVE_FANCY_SIMD
auto
shuffle
=
_mm256_set_epi64x
(
0x0706070607060706
,
0x0302030203020302
,
0x0504050405040504
,
0x0100010001000100
);
scales
[
0
]
=
_mm256_shuffle_epi8
(
all_scales
,
shuffle
);
scales
[
1
]
=
_mm256_shuffle_epi8
(
all_scales
,
_mm256_add_epi8
(
shuffle
,
_mm256_set1_epi8
(
8
)));
#else
set_scales_16
(
all_scales
,
scales
);
#endif
}
template
<
typename
Dequantizer
>
static
void
mul_mat_qX_K_q8_K_IQ_1
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
const
int
nb
=
n
/
QK_K
;
Q8
<
1
>
q8
(
info
);
Dequantizer
deq
(
vx
,
bx
);
__m256i
scales
[
2
];
__m256i
q8_quants
[
4
];
for
(
int
ix
=
0
;
ix
<
nrc_x
;
++
ix
)
{
__m256
accd
=
_mm256_setzero_ps
();
deq
.
new_row
(
ix
);
for
(
int
i
=
0
;
i
<
nb
;
++
i
)
{
__m256i
sumi
[
2
],
all_scales
[
Dequantizer
::
num_blocks
/
8
];
deq
.
new_block
(
i
,
all_scales
);
for
(
int
j
=
0
;
j
<
QK_K
/
128
;
++
j
)
{
deq
.
prepare
(
i
,
j
,
q8
,
q8_quants
);
if
constexpr
(
Dequantizer
::
num_blocks
==
8
)
{
set_scales_8_iq
(
j
,
all_scales
[
0
],
scales
);
}
else
{
set_scales_16_iq
(
all_scales
[
j
],
scales
);
}
multiply_add_1
(
j
,
deq
.
bits
,
scales
,
q8_quants
,
sumi
);
}
accd
=
_mm256_fmadd_ps
(
_mm256_set1_ps
(
deq
.
d
*
q8
.
scale
(
0
,
i
)),
_mm256_cvtepi32_ps
(
_mm256_add_epi32
(
sumi
[
0
],
sumi
[
1
])),
accd
);
}
info
.
store
(
ix
,
0
,
hsum_float_8
(
accd
));
}
}
template
<
typename
Dequantizer
,
int
nrc_y
>
static
void
mul_mat_qX_K_q8_K_IQ_N
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
const
int
nb
=
n
/
QK_K
;
Q8
<
nrc_y
>
q8
(
info
);
Dequantizer
deq
(
vx
,
bx
);
__m256i
scales
[
4
];
__m256
accd
[
nrc_y
];
for
(
int
ix
=
0
;
ix
<
nrc_x
;
++
ix
)
{
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
accd
[
iy
]
=
_mm256_setzero_ps
();
deq
.
new_row
(
ix
);
for
(
int
i
=
0
;
i
<
nb
;
++
i
)
{
__m256i
sumi
[
nrc_y
],
all_scales
[
Dequantizer
::
num_blocks
/
8
];
//for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
__m256i
mins
;
float
dmin
=
deq
.
new_block
(
i
,
all_scales
,
mins
);
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
auto
bsums
=
q8
.
load_bsums
(
iy
,
i
);
auto
prod
=
_mm256_madd_epi16
(
mins
,
bsums
);
accd
[
iy
]
=
_mm256_fmadd_ps
(
_mm256_set1_ps
(
dmin
*
q8
.
scale
(
iy
,
i
)),
_mm256_cvtepi32_ps
(
prod
),
accd
[
iy
]);
}
for
(
int
j
=
0
;
j
<
QK_K
/
128
;
++
j
)
{
deq
.
prepare
(
i
,
j
);
if
constexpr
(
Dequantizer
::
num_blocks
==
8
)
{
set_scales_8
(
all_scales
[
0
],
j
,
scales
);
}
else
{
set_scales_16
(
all_scales
[
j
],
scales
);
}
//multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
multiply_add
(
deq
.
bits
,
scales
,
j
,
i
,
q8
,
sumi
);
}
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
const
__m256
vd
=
_mm256_set1_ps
(
deq
.
d
*
q8
.
scale
(
iy
,
i
));
accd
[
iy
]
=
_mm256_fmadd_ps
(
vd
,
_mm256_cvtepi32_ps
(
sumi
[
iy
]),
accd
[
iy
]);
}
}
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
info
.
store
(
ix
,
iy
,
hsum_float_8
(
accd
[
iy
]));
}
}
}
template
<
typename
Dequantizer
,
int
nrc_y
>
static
void
mul_mat_qX_K_q8_K_IQ
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
assert
(
n
%
QK_K
==
0
);
#ifdef HAVE_FANCY_SIMD
if
constexpr
(
nrc_y
==
1
)
{
mul_mat_qX_K_q8_K_IQ_1
<
Dequantizer
>
(
n
,
vx
,
bx
,
info
,
nrc_x
);
}
else
{
mul_mat_qX_K_q8_K_IQ_N
<
Dequantizer
,
nrc_y
>
(
n
,
vx
,
bx
,
info
,
nrc_x
);
}
#else
mul_mat_qX_K_q8_K_IQ_N
<
Dequantizer
,
nrc_y
>
(
n
,
vx
,
bx
,
info
,
nrc_x
);
#endif
}
/*
moonll iq1s
core func for iq1s mul_mat_iq1_s_q8_K
*/
template
<
int
nrc_y
>
static
void
mul_mat_iq1_s_q8_K
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
GGML_ASSERT
(
n
%
QK_K
==
0
);
Q8
<
nrc_y
,
block_q8_K
>
q8
(
info
);
__m256i
qx
[
8
];
__m256i
scales
[
4
];
__m256
acc
[
nrc_y
]
=
{};
auto
delta_mask
=
_mm_set1_epi16
(
-
32768
);
// to avoid stupid overflow warnings when using 0x8000
__m256i
shuffle0
=
_mm256_set_epi64x
(
0x0302030203020302
,
0x0100010001000100
,
0x0302030203020302
,
0x0100010001000100
);
for
(
int
ix
=
0
;
ix
<
nrc_x
;
++
ix
)
{
auto
iq1s
=
(
const
block_iq1_s
*
)((
const
char
*
)
vx
+
ix
*
bx
);
for
(
int
ibl
=
0
;
ibl
<
n
/
QK_K
;
++
ibl
)
{
float
d
=
GGML_FP16_TO_FP32
(
iq1s
[
ibl
]
.
d
);
auto
qhb
=
_mm_loadu_si128
((
const
__m128i
*
)
iq1s
[
ibl
]
.
qh
);
auto
scales128
=
_mm_and_si128
(
_mm_srli_epi16
(
qhb
,
12
),
_mm_set1_epi16
(
7
));
scales128
=
_mm_add_epi16
(
_mm_slli_epi16
(
scales128
,
1
),
_mm_set1_epi16
(
1
));
#ifdef HAVE_FANCY_SIMD
auto
mask
=
_mm_cmpeq_epi16_mask
(
_mm_and_si128
(
qhb
,
delta_mask
),
delta_mask
);
auto
deltas128
=
_mm_mask_blend_epi16
(
mask
,
_mm_set1_epi16
(
-
7
),
_mm_set1_epi16
(
-
9
));
#else
auto
mask
=
_mm_cmpeq_epi16
(
_mm_and_si128
(
qhb
,
delta_mask
),
delta_mask
);
auto
deltas128
=
_mm_or_si128
(
_mm_and_si128
(
mask
,
_mm_set1_epi16
(
-
9
)),
_mm_andnot_si128
(
mask
,
_mm_set1_epi16
(
-
7
)));
#endif
deltas128
=
_mm_mullo_epi16
(
scales128
,
deltas128
);
scales128
=
_mm_slli_epi16
(
scales128
,
3
);
auto
deltas_l
=
_mm_unpacklo_epi16
(
deltas128
,
deltas128
);
auto
deltas_h
=
_mm_unpackhi_epi16
(
deltas128
,
deltas128
);
auto
deltas
=
MM256_SET_M128I
(
deltas_h
,
deltas_l
);
// blocks 0,0, 1,1, 2,2, ..., 7,7
auto
all_scales
=
MM256_SET_M128I
(
scales128
,
scales128
);
auto
shuffle
=
shuffle0
;
for
(
int
ib64
=
0
;
ib64
<
QK_K
/
64
;
++
ib64
)
{
scales
[
ib64
]
=
_mm256_shuffle_epi8
(
all_scales
,
shuffle
);
shuffle
=
_mm256_add_epi8
(
shuffle
,
_mm256_set1_epi8
(
4
));
}
const
uint8_t
*
qs
=
iq1s
[
ibl
]
.
qs
;
const
uint16_t
*
qh
=
iq1s
[
ibl
]
.
qh
;
for
(
int
ib
=
0
;
ib
<
QK_K
/
32
;
ib
+=
2
)
{
qx
[
ib
+
0
]
=
_mm256_set_epi64x
(
iq1s_grid_us
[
qs
[
3
]
|
((
qh
[
ib
+
0
]
>>
1
)
&
0x700
)],
iq1s_grid_us
[
qs
[
2
]
|
((
qh
[
ib
+
0
]
<<
2
)
&
0x700
)],
iq1s_grid_us
[
qs
[
1
]
|
((
qh
[
ib
+
0
]
<<
5
)
&
0x700
)],
iq1s_grid_us
[
qs
[
0
]
|
((
qh
[
ib
+
0
]
<<
8
)
&
0x700
)]);
qx
[
ib
+
1
]
=
_mm256_set_epi64x
(
iq1s_grid_us
[
qs
[
7
]
|
((
qh
[
ib
+
1
]
>>
1
)
&
0x700
)],
iq1s_grid_us
[
qs
[
6
]
|
((
qh
[
ib
+
1
]
<<
2
)
&
0x700
)],
iq1s_grid_us
[
qs
[
5
]
|
((
qh
[
ib
+
1
]
<<
5
)
&
0x700
)],
iq1s_grid_us
[
qs
[
4
]
|
((
qh
[
ib
+
1
]
<<
8
)
&
0x700
)]);
qs
+=
8
;
}
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
auto
bsums
=
q8
.
load_bsums
(
iy
,
ibl
);
auto
sumi
=
_mm256_setzero_si256
();
for
(
int
ib64
=
0
;
ib64
<
QK_K
/
64
;
++
ib64
)
{
auto
qy1
=
q8
.
load_quants
(
iy
,
ibl
,
2
*
ib64
+
0
);
auto
qy2
=
q8
.
load_quants
(
iy
,
ibl
,
2
*
ib64
+
1
);
#ifdef HAVE_FANCY_SIMD
auto
dot1
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
qx
[
2
*
ib64
+
0
],
qy1
);
auto
dot2
=
_mm256_dpbusd_epi32
(
_mm256_setzero_si256
(),
qx
[
2
*
ib64
+
1
],
qy2
);
sumi
=
_mm256_dpwssd_epi32
(
sumi
,
scales
[
ib64
],
_mm256_packs_epi32
(
dot1
,
dot2
));
#else
auto
dot1
=
_mm256_maddubs_epi16
(
qx
[
2
*
ib64
+
0
],
qy1
);
auto
dot2
=
_mm256_maddubs_epi16
(
qx
[
2
*
ib64
+
1
],
qy2
);
auto
dot
=
_mm256_add_epi16
(
_mm256_unpacklo_epi64
(
dot1
,
dot2
),
_mm256_unpackhi_epi64
(
dot1
,
dot2
));
sumi
=
_mm256_add_epi32
(
sumi
,
_mm256_madd_epi16
(
scales
[
ib64
],
dot
));
#endif
}
#ifdef HAVE_FANCY_SIMD
sumi
=
_mm256_dpwssd_epi32
(
sumi
,
bsums
,
deltas
);
#else
sumi
=
_mm256_add_epi32
(
sumi
,
_mm256_madd_epi16
(
bsums
,
deltas
));
#endif
acc
[
iy
]
=
_mm256_fmadd_ps
(
_mm256_set1_ps
(
d
*
q8
.
scale
(
iy
,
ibl
)),
_mm256_cvtepi32_ps
(
sumi
),
acc
[
iy
]);
}
}
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
{
info
.
store
(
ix
,
iy
,
0.125
f
*
hsum_float_8
(
acc
[
iy
]));
acc
[
iy
]
=
_mm256_setzero_ps
();
}
}
}
/*
moonll iq1s
DequantizerIQ2XXS
DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S
*/
struct
DequantizerIQ2XXS
final
:
public
BaseDequantizer
<
block_iq2_xxs
>
{
DequantizerIQ2XXS
(
const
void
*
vx
,
size_t
bx
)
:
BaseDequantizer
(
vx
,
bx
)
{}
constexpr
static
int
num_blocks
=
8
;
union
Data
{
__m256i
vec
;
uint32_t
val
[
8
];
};
inline
__m128i
load_scales
(
int
i
)
{
d
=
0.125
f
*
GGML_FP16_TO_FP32
(
x
[
i
]
.
d
);
const
uint16_t
*
a16
=
(
const
uint16_t
*
)
x
[
i
]
.
qs
;
auto
scales
=
_mm_srli_epi16
(
_mm_set_epi16
(
a16
[
31
],
a16
[
27
],
a16
[
23
],
a16
[
19
],
a16
[
15
],
a16
[
11
],
a16
[
7
],
a16
[
3
]),
12
);
return
_mm_or_si128
(
_mm_slli_epi16
(
scales
,
1
),
_mm_set1_epi16
(
1
));
}
inline
void
new_block
(
int
i
,
__m256i
*
scales
)
{
auto
sc16
=
load_scales
(
i
);
scales
[
0
]
=
MM256_SET_M128I
(
sc16
,
sc16
);
}
inline
float
new_block
(
int
i
,
__m256i
*
scales
,
__m256i
&
mins
)
{
auto
sc16
=
load_scales
(
i
);
mins
=
scb
.
shuffle
(
sc16
);
scales
[
0
]
=
MM256_SET_M128I
(
sc16
,
sc16
);
return
-
d
*
minv
;
}
inline
static
void
make4
(
const
uint32_t
*
aux32
,
__m256i
*
values
)
{
const
uint8_t
*
aux8
=
(
const
uint8_t
*
)
aux32
;
values
[
0
]
=
_mm256_set_epi64x
(
iq2xxs_grid
[
aux8
[
3
]],
iq2xxs_grid
[
aux8
[
2
]],
iq2xxs_grid
[
aux8
[
1
]],
iq2xxs_grid
[
aux8
[
0
]]);
values
[
1
]
=
_mm256_set_epi64x
(
iq2xxs_grid
[
aux8
[
11
]],
iq2xxs_grid
[
aux8
[
10
]],
iq2xxs_grid
[
aux8
[
9
]],
iq2xxs_grid
[
aux8
[
8
]]);
values
[
2
]
=
_mm256_set_epi64x
(
iq2xxs_grid
[
aux8
[
19
]],
iq2xxs_grid
[
aux8
[
18
]],
iq2xxs_grid
[
aux8
[
17
]],
iq2xxs_grid
[
aux8
[
16
]]);
values
[
3
]
=
_mm256_set_epi64x
(
iq2xxs_grid
[
aux8
[
27
]],
iq2xxs_grid
[
aux8
[
26
]],
iq2xxs_grid
[
aux8
[
25
]],
iq2xxs_grid
[
aux8
[
24
]]);
}
IQK_ALWAYS_INLINE
void
sign_values
(
const
uint32_t
*
aux32
,
__m256i
*
values
)
const
{
#ifdef HAVE_FANCY_SIMD
esh
.
sign_2_values
(
MM256_SET_M128I
(
_mm_set1_epi32
(
aux32
[
3
]),
_mm_set1_epi32
(
aux32
[
1
])),
values
+
0
);
esh
.
sign_2_values
(
MM256_SET_M128I
(
_mm_set1_epi32
(
aux32
[
7
]),
_mm_set1_epi32
(
aux32
[
5
])),
values
+
2
);
#else
esh
.
sign_value
(
aux32
[
1
],
values
[
0
]);
esh
.
sign_value
(
aux32
[
3
],
values
[
1
]);
esh
.
sign_value
(
aux32
[
5
],
values
[
2
]);
esh
.
sign_value
(
aux32
[
7
],
values
[
3
]);
#endif
}
inline
void
make4_signed
(
const
uint32_t
*
aux32
,
const
__m256i
&
min_value
,
__m256i
*
values
)
const
{
make4
(
aux32
,
values
);
sign_values
(
aux32
,
values
);
for
(
int
k
=
0
;
k
<
4
;
++
k
)
values
[
k
]
=
_mm256_add_epi8
(
values
[
k
],
min_value
);
}
inline
void
make4
(
const
uint32_t
*
aux32
,
__m256i
*
values
,
__m256i
*
q8
)
const
{
make4
(
aux32
,
values
);
sign_values
(
aux32
,
q8
);
}
inline
void
prepare
(
int
i
,
int
j
)
{
Data
data
;
data
.
vec
=
_mm256_loadu_si256
((
const
__m256i
*
)
x
[
i
]
.
qs
+
j
);
make4_signed
(
data
.
val
,
min_value
,
bits
.
values
);
}
inline
void
prepare
(
int
i
,
int
j
,
const
Q8
<
1
>&
q8
,
__m256i
*
q8_quants
)
{
for
(
int
k
=
0
;
k
<
4
;
++
k
)
q8_quants
[
k
]
=
q8
.
load_quants
(
0
,
i
,
4
*
j
+
k
);
Data
data
;
data
.
vec
=
_mm256_loadu_si256
((
const
__m256i
*
)
x
[
i
]
.
qs
+
j
);
make4
(
data
.
val
,
bits
.
values
,
q8_quants
);
}
constexpr
static
int
minv
=
43
;
SimpleBits
bits
;
Scales8KBase
scb
;
EvenSignHelper
esh
;
const
__m256i
min_value
=
_mm256_set1_epi8
(
minv
);
const
__m256i
shuffle
=
_mm256_set_epi32
(
7
,
5
,
3
,
1
,
7
,
5
,
3
,
1
);
};
/*
moonll
add Q8_0_Unpacker && DequantizerIQ2XXS support
add func mul_mat_qX_K_q8_K_IQ
*/
template
<
typename
Dequantizer
>
void
MulMat
::
set_functions
(
MulMat
&
m
)
{
template
<
typename
Dequantizer
>
void
MulMat
::
set_functions
(
MulMat
&
m
)
{
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
Q4_0_Unpacker
>
||
std
::
is_same_v
<
Dequantizer
,
Q5_0_Unpacker
>
)
{
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
Q4_0_Unpacker
>
||
std
::
is_same_v
<
Dequantizer
,
Q5_0_Unpacker
>
||
std
::
is_same_v
<
Dequantizer
,
Q8_0_Unpacker
>
)
{
m
.
funcs
[
0
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
1
>
;
m
.
funcs
[
0
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
1
>
;
m
.
funcs
[
1
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
2
>
;
m
.
funcs
[
1
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
2
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
3
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
3
>
;
...
@@ -1364,7 +2835,7 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
...
@@ -1364,7 +2835,7 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
m
.
funcs
[
6
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
7
>
;
m
.
funcs
[
6
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
7
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
8
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_0_q8_0_T
<
Dequantizer
,
8
>
;
}
}
else
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
Q4_1_Unpacker
>
||
std
::
is_same_v
<
Dequantizer
,
Q5_1_Unpacker
>
)
{
else
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
Q4_1_Unpacker
>
||
std
::
is_same_v
<
Dequantizer
,
Q5_1_Unpacker
>
||
std
::
is_same_v
<
Dequantizer
,
Q8_0_1_Unpacker
>
)
{
m
.
funcs
[
0
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
1
>
;
m
.
funcs
[
0
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
1
>
;
m
.
funcs
[
1
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
2
>
;
m
.
funcs
[
1
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
2
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
3
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
3
>
;
...
@@ -1374,16 +2845,37 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
...
@@ -1374,16 +2845,37 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
m
.
funcs
[
6
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
7
>
;
m
.
funcs
[
6
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
7
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
8
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_1_q8_1_T
<
Dequantizer
,
8
>
;
}
}
else
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
DequantizerIQ2XXS
>
)
{
m
.
funcs
[
0
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
1
>
;
m
.
funcs
[
1
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
2
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
3
>
;
m
.
funcs
[
3
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
4
>
;
m
.
funcs
[
4
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
5
>
;
m
.
funcs
[
5
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
6
>
;
m
.
funcs
[
6
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
7
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_K_q8_K_IQ
<
Dequantizer
,
8
>
;
}
else
{
else
{
#ifdef HAVE_FANCY_SIMD
#ifdef HAVE_FANCY_SIMD
m
.
funcs
[
0
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
1
>
;
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
DequantizerIQ4XS
>
)
{
m
.
funcs
[
1
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
2
>
;
m
.
funcs
[
0
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
1
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
3
>
;
m
.
funcs
[
1
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
2
>
;
m
.
funcs
[
3
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
4
>
;
m
.
funcs
[
2
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
3
>
;
m
.
funcs
[
4
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
5
>
;
m
.
funcs
[
3
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
4
>
;
m
.
funcs
[
5
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
6
>
;
m
.
funcs
[
4
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
5
>
;
m
.
funcs
[
6
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
7
>
;
m
.
funcs
[
5
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
6
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_K_q8_K_T
<
Dequantizer
,
8
>
;
m
.
funcs
[
6
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
7
>
;
m
.
funcs
[
7
]
=
mul_mat_iqX_k_q8_K_AVX512
<
Dequantizer
,
8
>
;
}
else
{
m
.
funcs
[
0
]
=
mul_mat_qX_K_q8_K_AVX512_1
<
Dequantizer
>
;
m
.
funcs
[
1
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
2
>
;
m
.
funcs
[
2
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
3
>
;
m
.
funcs
[
3
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
4
>
;
m
.
funcs
[
4
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
5
>
;
m
.
funcs
[
5
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
6
>
;
m
.
funcs
[
6
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
7
>
;
m
.
funcs
[
7
]
=
mul_mat_qX_K_q8_K_AVX512
<
Dequantizer
,
8
>
;
}
#else
#else
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
DequantizerQ2K
>
||
if
constexpr
(
std
::
is_same_v
<
Dequantizer
,
DequantizerQ2K
>
||
std
::
is_same_v
<
Dequantizer
,
DequantizerQ3K
>
||
std
::
is_same_v
<
Dequantizer
,
DequantizerQ3K
>
||
...
@@ -1410,11 +2902,260 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
...
@@ -1410,11 +2902,260 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
}
}
}
}
bool
MulMat
::
set_mul_mat
(
int
typeA
,
int
ne00
,
MulMat
&
mm
,
int
&
row_size_q8
,
int
)
{
struct
QFBase
{
#ifdef __AVX512F__
constexpr
static
int
k_step
=
16
;
using
Data
=
__m512
;
using
Acc
=
__m512
;
static
inline
Data
load
(
const
ggml_half
*
x
)
{
return
_mm512_cvtph_ps
(
_mm256_loadu_si256
((
const
__m256i
*
)
x
));
}
static
inline
Data
load
(
const
float
*
x
)
{
return
_mm512_loadu_ps
(
x
);
}
static
inline
Data
load
(
const
ggml_bf16_t
*
x
)
{
return
_mm512_castsi512_ps
(
_mm512_slli_epi32
(
_mm512_cvtepu16_epi32
(
_mm256_loadu_si256
((
const
__m256i
*
)
x
)),
16
));
}
static
inline
Acc
acc
(
Acc
prev
,
const
Data
&
y
,
const
Data
&
x
)
{
return
_mm512_fmadd_ps
(
y
,
x
,
prev
);
}
static
inline
Acc
acc_first
(
const
Data
&
y
,
const
Data
&
x
)
{
return
_mm512_mul_ps
(
y
,
x
);
}
static
inline
Acc
add
(
Acc
x
,
Acc
y
)
{
return
_mm512_add_ps
(
x
,
y
);
}
static
inline
float
hsum
(
Acc
acc
)
{
return
_mm512_reduce_add_ps
(
acc
);
}
template
<
typename
Float
>
static
inline
Data
load4Floats
(
const
Float
*
x
)
{
return
_mm512_insertf32x4
(
_mm512_setzero_ps
(),
load128
(
x
),
0
);
}
static
inline
Acc
acc_r4
(
Acc
acc
,
const
Data
*
xv
,
const
Data
&
yv
)
{
acc
=
_mm512_fmadd_ps
(
xv
[
0
],
_mm512_shuffle_ps
(
yv
,
yv
,
0x00
),
acc
);
acc
=
_mm512_fmadd_ps
(
xv
[
1
],
_mm512_shuffle_ps
(
yv
,
yv
,
0x55
),
acc
);
acc
=
_mm512_fmadd_ps
(
xv
[
2
],
_mm512_shuffle_ps
(
yv
,
yv
,
0xaa
),
acc
);
acc
=
_mm512_fmadd_ps
(
xv
[
3
],
_mm512_shuffle_ps
(
yv
,
yv
,
0xff
),
acc
);
return
acc
;
}
static
inline
Acc
acc_r4_first
(
const
Data
*
xv
,
const
Data
&
yv
)
{
auto
acc
=
_mm512_mul_ps
(
xv
[
0
],
_mm512_shuffle_ps
(
yv
,
yv
,
0x00
));
acc
=
_mm512_fmadd_ps
(
xv
[
1
],
_mm512_shuffle_ps
(
yv
,
yv
,
0x55
),
acc
);
acc
=
_mm512_fmadd_ps
(
xv
[
2
],
_mm512_shuffle_ps
(
yv
,
yv
,
0xaa
),
acc
);
acc
=
_mm512_fmadd_ps
(
xv
[
3
],
_mm512_shuffle_ps
(
yv
,
yv
,
0xff
),
acc
);
return
acc
;
}
static
inline
__m128
hsum_r4
(
Acc
acc
)
{
auto
sum1
=
_mm_add_ps
(
_mm512_extractf32x4_ps
(
acc
,
0
),
_mm512_extractf32x4_ps
(
acc
,
1
));
auto
sum2
=
_mm_add_ps
(
_mm512_extractf32x4_ps
(
acc
,
2
),
_mm512_extractf32x4_ps
(
acc
,
3
));
return
_mm_add_ps
(
sum1
,
sum2
);
}
#else
constexpr
static
int
k_step
=
8
;
using
Data
=
__m256
;
using
Acc
=
__m256
;
static
inline
Data
load
(
const
ggml_half
*
x
)
{
return
_mm256_cvtph_ps
(
_mm_loadu_si128
((
const
__m128i
*
)
x
));
}
static
inline
Data
load
(
const
float
*
x
)
{
return
_mm256_loadu_ps
(
x
);
}
static
inline
Data
load
(
const
ggml_bf16_t
*
x
)
{
return
_mm256_castsi256_ps
(
_mm256_slli_epi32
(
_mm256_cvtepu16_epi32
(
_mm_loadu_si128
((
const
__m128i
*
)
x
)),
16
));
}
static
inline
Acc
acc
(
Acc
prev
,
const
Data
&
y
,
const
Data
&
x
)
{
return
_mm256_fmadd_ps
(
y
,
x
,
prev
);
}
static
inline
Acc
add
(
Acc
x
,
Acc
y
)
{
return
_mm256_add_ps
(
x
,
y
);
}
static
inline
Acc
acc_r4
(
Acc
acc
,
const
Data
*
xv
,
const
Data
&
yv
)
{
acc
=
_mm256_fmadd_ps
(
xv
[
0
],
_mm256_shuffle_ps
(
yv
,
yv
,
0x00
),
acc
);
acc
=
_mm256_fmadd_ps
(
xv
[
1
],
_mm256_shuffle_ps
(
yv
,
yv
,
0x55
),
acc
);
acc
=
_mm256_fmadd_ps
(
xv
[
2
],
_mm256_shuffle_ps
(
yv
,
yv
,
0xaa
),
acc
);
acc
=
_mm256_fmadd_ps
(
xv
[
3
],
_mm256_shuffle_ps
(
yv
,
yv
,
0xff
),
acc
);
return
acc
;
}
static
inline
Acc
acc_r4_first
(
const
Data
*
xv
,
const
Data
&
yv
)
{
auto
acc
=
_mm256_mul_ps
(
xv
[
0
],
_mm256_shuffle_ps
(
yv
,
yv
,
0x00
));
acc
=
_mm256_fmadd_ps
(
xv
[
1
],
_mm256_shuffle_ps
(
yv
,
yv
,
0x55
),
acc
);
acc
=
_mm256_fmadd_ps
(
xv
[
2
],
_mm256_shuffle_ps
(
yv
,
yv
,
0xaa
),
acc
);
acc
=
_mm256_fmadd_ps
(
xv
[
3
],
_mm256_shuffle_ps
(
yv
,
yv
,
0xff
),
acc
);
return
acc
;
}
static
inline
Acc
acc_first
(
const
Data
&
y
,
const
Data
&
x
)
{
return
_mm256_mul_ps
(
y
,
x
);
}
static
inline
float
hsum
(
Acc
acc
)
{
return
hsum_float_8
(
acc
);
}
static
inline
__m128
hsum_r4
(
Acc
acc
)
{
return
_mm_add_ps
(
_mm256_castps256_ps128
(
acc
),
_mm256_extractf128_ps
(
acc
,
1
));
}
template
<
typename
Float
>
static
inline
Data
load4Floats
(
const
Float
*
x
)
{
return
_mm256_insertf128_ps
(
_mm256_setzero_ps
(),
load128
(
x
),
0
);
}
#endif
static
inline
__m128
load128
(
const
ggml_half
*
x
)
{
return
_mm_cvtph_ps
(
_mm_loadl_epi64
((
const
__m128i
*
)
x
));
}
static
inline
__m128
load128
(
const
float
*
x
)
{
return
_mm_loadu_ps
(
x
);
}
static
inline
__m128
load128
(
const
ggml_bf16_t
*
x
)
{
return
_mm_castsi128_ps
(
_mm_slli_epi32
(
_mm_cvtepu16_epi32
(
_mm_loadl_epi64
((
const
__m128i
*
)
x
)),
16
));
}
};
template
<
typename
Float
,
int
nrc_in
>
struct
QFT
final
:
public
QFBase
{
constexpr
static
int
nrc
=
nrc_in
;
QFT
(
const
DataInfo
&
info
)
{
for
(
int
iy
=
0
;
iy
<
nrc
;
++
iy
)
y
[
iy
]
=
(
const
Float
*
)
info
.
src1_row
(
iy
);
}
QFT
(
const
char
*
cx
,
size_t
bx
)
{
for
(
int
iy
=
0
;
iy
<
nrc
;
++
iy
)
y
[
iy
]
=
(
const
Float
*
)(
cx
+
iy
*
bx
);
}
IQK_ALWAYS_INLINE
Data
load1
(
int
iy
,
int
i
)
const
{
return
load
(
y
[
iy
]
+
k_step
*
i
);
}
IQK_ALWAYS_INLINE
Data
load_tail
(
int
iy
,
int
i
)
const
{
return
load4Floats
(
y
[
iy
]
+
4
*
i
);
}
IQK_ALWAYS_INLINE
void
load_r4
(
int
ix
,
int
i
,
Data
*
xv
)
const
{
xv
[
0
]
=
load1
(
ix
+
0
,
i
);
xv
[
1
]
=
load1
(
ix
+
1
,
i
);
xv
[
2
]
=
load1
(
ix
+
2
,
i
);
xv
[
3
]
=
load1
(
ix
+
3
,
i
);
#ifdef __AVX512F__
auto
t0
=
_mm512_unpacklo_ps
(
xv
[
0
],
xv
[
1
]);
auto
t1
=
_mm512_unpacklo_ps
(
xv
[
2
],
xv
[
3
]);
auto
t2
=
_mm512_unpackhi_ps
(
xv
[
0
],
xv
[
1
]);
auto
t3
=
_mm512_unpackhi_ps
(
xv
[
2
],
xv
[
3
]);
xv
[
0
]
=
_mm512_castpd_ps
(
_mm512_unpacklo_pd
(
_mm512_castps_pd
(
t0
),
_mm512_castps_pd
(
t1
)));
xv
[
1
]
=
_mm512_castpd_ps
(
_mm512_unpackhi_pd
(
_mm512_castps_pd
(
t0
),
_mm512_castps_pd
(
t1
)));
xv
[
2
]
=
_mm512_castpd_ps
(
_mm512_unpacklo_pd
(
_mm512_castps_pd
(
t2
),
_mm512_castps_pd
(
t3
)));
xv
[
3
]
=
_mm512_castpd_ps
(
_mm512_unpackhi_pd
(
_mm512_castps_pd
(
t2
),
_mm512_castps_pd
(
t3
)));
#else
auto
t0
=
_mm256_unpacklo_ps
(
xv
[
0
],
xv
[
1
]);
auto
t1
=
_mm256_unpacklo_ps
(
xv
[
2
],
xv
[
3
]);
auto
t2
=
_mm256_unpackhi_ps
(
xv
[
0
],
xv
[
1
]);
auto
t3
=
_mm256_unpackhi_ps
(
xv
[
2
],
xv
[
3
]);
xv
[
0
]
=
_mm256_castpd_ps
(
_mm256_unpacklo_pd
(
_mm256_castps_pd
(
t0
),
_mm256_castps_pd
(
t1
)));
xv
[
1
]
=
_mm256_castpd_ps
(
_mm256_unpackhi_pd
(
_mm256_castps_pd
(
t0
),
_mm256_castps_pd
(
t1
)));
xv
[
2
]
=
_mm256_castpd_ps
(
_mm256_unpacklo_pd
(
_mm256_castps_pd
(
t2
),
_mm256_castps_pd
(
t3
)));
xv
[
3
]
=
_mm256_castpd_ps
(
_mm256_unpackhi_pd
(
_mm256_castps_pd
(
t2
),
_mm256_castps_pd
(
t3
)));
#endif
}
const
Float
*
y
[
nrc
];
};
if
(
ne00
%
ggml_blck_size
(
GGML_TYPE_Q8_K
)
==
0
)
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_K
,
ne00
);
template
<
typename
Qy
,
typename
Qx
>
IQK_NOINLINE
void
mul_mat_Qx_Qy_MxN
(
int
n
,
const
char
*
cx
,
size_t
bx
,
int
ix0
,
const
DataInfo
&
info
)
{
int
nb
=
n
/
QFBase
::
k_step
;
int
nb4
=
n
/
4
;
Qy
y
(
info
);
Qx
x
(
cx
+
ix0
*
bx
,
bx
);
QFBase
::
Data
xv
[
Qx
::
nrc
];
QFBase
::
Acc
acc
[
Qx
::
nrc
*
Qy
::
nrc
];
auto
yv
=
y
.
load1
(
0
,
0
);
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
{
xv
[
ix
]
=
x
.
load1
(
ix
,
0
);
acc
[
ix
]
=
QFBase
::
acc_first
(
yv
,
xv
[
ix
]);
}
for
(
int
iy
=
1
;
iy
<
Qy
::
nrc
;
++
iy
)
{
yv
=
y
.
load1
(
iy
,
0
);
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
acc
[
Qx
::
nrc
*
iy
+
ix
]
=
QFBase
::
acc_first
(
yv
,
xv
[
ix
]);
}
for
(
int
i
=
1
;
i
<
nb
;
++
i
)
{
yv
=
y
.
load1
(
0
,
i
);
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
{
xv
[
ix
]
=
x
.
load1
(
ix
,
i
);
acc
[
ix
]
=
QFBase
::
acc
(
acc
[
ix
],
yv
,
xv
[
ix
]);
}
for
(
int
iy
=
1
;
iy
<
Qy
::
nrc
;
++
iy
)
{
yv
=
y
.
load1
(
iy
,
i
);
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
acc
[
Qx
::
nrc
*
iy
+
ix
]
=
QFBase
::
acc
(
acc
[
Qx
::
nrc
*
iy
+
ix
],
yv
,
xv
[
ix
]);
}
}
for
(
int
i
=
(
QFBase
::
k_step
/
4
)
*
nb
;
i
<
nb4
;
++
i
)
{
yv
=
y
.
load_tail
(
0
,
i
);
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
{
xv
[
ix
]
=
x
.
load_tail
(
ix
,
i
);
acc
[
ix
]
=
QFBase
::
acc
(
acc
[
ix
],
yv
,
xv
[
ix
]);
}
for
(
int
iy
=
1
;
iy
<
Qy
::
nrc
;
++
iy
)
{
yv
=
y
.
load_tail
(
iy
,
i
);
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
acc
[
Qx
::
nrc
*
iy
+
ix
]
=
QFBase
::
acc
(
acc
[
Qx
::
nrc
*
iy
+
ix
],
yv
,
xv
[
ix
]);
}
}
for
(
int
iy
=
0
;
iy
<
Qy
::
nrc
;
++
iy
)
for
(
int
ix
=
0
;
ix
<
Qx
::
nrc
;
++
ix
)
info
.
store
(
ix0
+
ix
,
iy
,
QFBase
::
hsum
(
acc
[
Qx
::
nrc
*
iy
+
ix
]));
}
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template
<
int
nrc_y
,
typename
FloatX
,
typename
FloatY
>
void
mul_mat_fX_fY_T
(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
)
{
const
char
*
cx
=
(
const
char
*
)
vx
;
// TBD if we want this
//if constexpr (nrc_y == 1) {
// constexpr int k_nx = 2;
// for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
// mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
// }
// if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
// int nx = nrc_x - lastx;
// switch (nx) {
// case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
// case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
// case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
// }
// //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
// }
// return;
//}
#ifdef __AVX512F__
constexpr
int
k_nx
=
5
;
#else
constexpr
int
k_nx
=
nrc_y
==
1
?
4
:
2
;
#endif
for
(
int
ix
=
0
;
ix
<
nrc_x
/
k_nx
;
++
ix
)
{
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
k_nx
>>
(
n
,
cx
,
bx
,
ix
*
k_nx
,
info
);
}
int
last_x
=
k_nx
*
(
nrc_x
/
k_nx
);
if
(
last_x
==
nrc_x
)
return
;
int
nx
=
nrc_x
-
last_x
;
#ifdef __AVX512F__
switch
(
nx
)
{
case
1
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
1
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
case
2
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
2
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
case
3
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
3
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
case
4
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
4
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
}
#else
if
constexpr
(
nrc_y
==
1
)
{
switch
(
nx
)
{
case
1
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
1
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
case
2
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
2
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
case
3
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
3
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
}
}
else
{
switch
(
nx
)
{
case
1
:
mul_mat_Qx_Qy_MxN
<
QFT
<
FloatY
,
nrc_y
>
,
QFT
<
FloatX
,
1
>>
(
n
,
cx
,
bx
,
last_x
,
info
);
break
;
}
}
#endif
}
template
<
typename
FloatX
,
typename
FloatY
>
void
set_mul_mat_f
(
MulMat
&
mm
)
{
for
(
auto
&
f
:
mm
.
funcs
)
f
=
nullptr
;
mm
.
funcs
[
0
]
=
mul_mat_fX_fY_T
<
1
,
FloatX
,
FloatY
>
;
mm
.
funcs
[
1
]
=
mul_mat_fX_fY_T
<
2
,
FloatX
,
FloatY
>
;
mm
.
funcs
[
2
]
=
mul_mat_fX_fY_T
<
3
,
FloatX
,
FloatY
>
;
mm
.
funcs
[
3
]
=
mul_mat_fX_fY_T
<
4
,
FloatX
,
FloatY
>
;
mm
.
funcs
[
4
]
=
mul_mat_fX_fY_T
<
5
,
FloatX
,
FloatY
>
;
#ifndef __AVX512F__
mm
.
funcs
[
5
]
=
mul_mat_fX_fY_T
<
6
,
FloatX
,
FloatY
>
;
#endif
}
/*
moonll
add typeb TO compare return not expected type of weight matrix
add IQ2XSS
add IQ1_S
add GGML_TYPE_IQ4_XS
*/
bool
MulMat
::
set_mul_mat
(
int
typeA
,
int
typeB
,
int
ne00
,
MulMat
&
mm
,
int
Ny
)
{
(
void
)
Ny
;
auto
expected_typeB
=
GGML_TYPE_Q8_K
;
switch
(
typeA
)
{
switch
(
typeA
)
{
case
GGML_TYPE_Q2_K
:
case
GGML_TYPE_Q2_K
:
assert
(
ne00
%
QK_K
==
0
);
assert
(
ne00
%
QK_K
==
0
);
...
@@ -1440,37 +3181,75 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int)
...
@@ -1440,37 +3181,75 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int)
assert
(
ne00
%
QK_K
==
0
);
assert
(
ne00
%
QK_K
==
0
);
MulMat
::
set_functions
<
DequantizerIQ4XS
>
(
mm
);
MulMat
::
set_functions
<
DequantizerIQ4XS
>
(
mm
);
break
;
break
;
case
GGML_TYPE_IQ2_XXS
:
assert
(
ne00
%
QK_K
==
0
);
MulMat
::
set_functions
<
DequantizerIQ2XXS
>
(
mm
);
break
;
case
GGML_TYPE_Q4_0
:
case
GGML_TYPE_Q4_0
:
assert
(
ne00
%
QK4_0
==
0
);
assert
(
ne00
%
QK4_0
==
0
);
MulMat
::
set_functions
<
Q4_0_Unpacker
>
(
mm
);
MulMat
::
set_functions
<
Q4_0_Unpacker
>
(
mm
);
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_0
,
ne00
)
;
expected_typeB
=
GGML_TYPE_Q8_0
;
break
;
break
;
case
GGML_TYPE_Q4_1
:
case
GGML_TYPE_Q4_1
:
assert
(
ne00
%
QK4_1
==
0
);
assert
(
ne00
%
QK4_1
==
0
);
MulMat
::
set_functions
<
Q4_1_Unpacker
>
(
mm
);
MulMat
::
set_functions
<
Q4_1_Unpacker
>
(
mm
);
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_1
,
ne00
)
;
expected_typeB
=
GGML_TYPE_Q8_1
_X4
;
break
;
break
;
case
GGML_TYPE_Q5_0
:
case
GGML_TYPE_Q5_0
:
assert
(
ne00
%
QK5_0
==
0
);
assert
(
ne00
%
QK5_0
==
0
);
MulMat
::
set_functions
<
Q5_0_Unpacker
>
(
mm
);
MulMat
::
set_functions
<
Q5_0_Unpacker
>
(
mm
);
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_0
,
ne00
)
;
expected_typeB
=
GGML_TYPE_Q8_0
;
break
;
break
;
case
GGML_TYPE_Q5_1
:
case
GGML_TYPE_Q5_1
:
assert
(
ne00
%
QK5_1
==
0
);
assert
(
ne00
%
QK5_1
==
0
);
MulMat
::
set_functions
<
Q5_1_Unpacker
>
(
mm
);
MulMat
::
set_functions
<
Q5_1_Unpacker
>
(
mm
);
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_1
,
ne00
);
expected_typeB
=
GGML_TYPE_Q8_1_X4
;
break
;
case
GGML_TYPE_Q8_0
:
assert
(
ne00
%
QK8_0
==
0
);
#ifdef HAVE_FANCY_SIMD
MulMat
::
set_functions
<
Q8_0_1_Unpacker
>
(
mm
);
expected_typeB
=
GGML_TYPE_Q8_1_X4
;
#else
MulMat
::
set_functions
<
Q8_0_Unpacker
>
(
mm
);
expected_typeB
=
GGML_TYPE_Q8_0_X4
;
#endif
break
;
case
GGML_TYPE_IQ1_S
:
mm
.
funcs
[
0
]
=
mul_mat_iq1_s_q8_K
<
1
>
;
mm
.
funcs
[
1
]
=
mul_mat_iq1_s_q8_K
<
2
>
;
mm
.
funcs
[
2
]
=
mul_mat_iq1_s_q8_K
<
3
>
;
mm
.
funcs
[
3
]
=
mul_mat_iq1_s_q8_K
<
4
>
;
mm
.
funcs
[
4
]
=
mul_mat_iq1_s_q8_K
<
5
>
;
mm
.
funcs
[
5
]
=
mul_mat_iq1_s_q8_K
<
6
>
;
mm
.
funcs
[
6
]
=
mul_mat_iq1_s_q8_K
<
7
>
;
mm
.
funcs
[
7
]
=
mul_mat_iq1_s_q8_K
<
8
>
;
#ifdef HAVE_FANCY_SIMD
mm
.
func16
=
mul_mat_iq1_s_q8_K
<
16
>
;
#endif
// row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
expected_typeB
=
GGML_TYPE_Q8_K
;
break
;
break
;
default
:
default
:
{
printf
(
"case:%d"
,
typeA
);
return
false
;
return
false
;
}
}
return
true
;
}
return
ggml_type
(
typeB
)
==
expected_typeB
;
}
}
}
// namespace
}
// namespace
/*
iq1_s is not support for arm
*/
#else // __aarch64__
#else // __aarch64__
namespace
{
namespace
{
...
...
third_party/llamafile/sgemm.h
View file @
216a63b8
...
@@ -12,10 +12,15 @@ extern "C" {
...
@@ -12,10 +12,15 @@ extern "C" {
struct
ggml_tensor
;
struct
ggml_tensor
;
struct
ggml_compute_params
;
struct
ggml_compute_params
;
/*moonll old
add more params typeb...
*/
bool
iqk_mul_mat
(
long
,
long
,
long
,
int
,
const
void
*
,
long
,
int
,
const
void
*
,
long
,
float
*
,
long
,
int
,
int
);
bool
iqk_mul_mat_zen4
(
long
,
long
,
long
,
int
,
const
void
*
,
long
,
int
,
const
void
*
,
long
,
float
*
,
long
,
int
,
int
);
bool
iqk_mul_mat_arm82
(
long
,
long
,
long
,
int
,
const
void
*
,
long
,
int
,
const
void
*
,
long
,
float
*
,
long
,
int
,
int
);
bool
iqk_mul_mat
(
long
,
long
,
long
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
int
,
int
);
bool
iqk_mul_mat_zen4
(
long
,
long
,
long
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
int
,
int
);
bool
iqk_mul_mat_arm82
(
long
,
long
,
long
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
int
,
int
);
bool
iqk_mul_mat_moe
(
long
,
long
,
long
,
int
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
long
,
const
void
*
,
int
,
int
);
bool
iqk_mul_mat_moe
(
long
,
long
,
long
,
int
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
long
,
const
void
*
,
int
,
int
);
bool
iqk_mul_mat_moe_zen4
(
long
,
long
,
long
,
int
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
long
,
const
void
*
,
int
,
int
);
bool
iqk_mul_mat_moe_zen4
(
long
,
long
,
long
,
int
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
long
,
const
void
*
,
int
,
int
);
...
...
third_party/llamafile/tinyblas_cpu_sgemm.inc
View file @
216a63b8
...
@@ -323,20 +323,17 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
...
@@ -323,20 +323,17 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
#if QK_K == 256
#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
// if (X86_CHECK(AVX2) && X86_CHECK(FMA)) {
/*
if
(
Btype
==
GGML_TYPE_Q8_K
&&
Ctype
==
GGML_TYPE_F32
)
{
moonll
if
(
iqk_mul_mat
(
m
,
n
,
k
*
QK_K
,
Atype
,
A
,
B
,
(
float
*
)
C
,
ldc
,
ith
,
nth
))
{
more Btype accept
return
true
;
}*/
}
}
if
(
Ctype
==
GGML_TYPE_F32
){
if
((
Btype
==
GGML_TYPE_Q8_0
||
Btype
==
GGML_TYPE_Q8_1
)
&&
Ctype
==
GGML_TYPE_F32
)
{
if
(
iqk_mul_mat
(
m
,
n
,
k
*
ggml_blck_size
(
ggml_type
(
Atype
)),
Atype
,
A
,
lda
,
Btype
,
B
,
ldb
,
(
float
*
)
C
,
ldc
,
ith
,
nth
))
{
// assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
assert
((
QK8_0
==
32
)
&&
(
QK8_1
==
32
)
&&
(
QK4_0
==
32
)
&&
(
QK4_1
==
32
)
&&
(
QK5_0
==
32
)
&&
(
QK5_1
==
32
));
if
(
iqk_mul_mat
(
m
,
n
,
k
*
QK8_0
,
Atype
,
A
,
B
,
(
float
*
)
C
,
ldc
,
ith
,
nth
))
{
return
true
;
return
true
;
}
}
}
}
// }
#endif
#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
if
(
Btype
==
GGML_TYPE_Q8_K
&&
Ctype
==
GGML_TYPE_F32
)
{
if
(
Btype
==
GGML_TYPE_Q8_K
&&
Ctype
==
GGML_TYPE_F32
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment