OpenDAS / ollama · Commits · d3ad6274

Commit d3ad6274
authored Nov 12, 2024 by xuxzh1

init

parent 97b02a89

Changes: 193
Showing 20 changed files with 0 additions and 3111 deletions
llm/llama.cpp/ggml-cuda/im2col.cu      +0 -104
llm/llama.cpp/ggml-cuda/im2col.cuh     +0 -5
llm/llama.cpp/ggml-cuda/mmq.cu         +0 -1564
llm/llama.cpp/ggml-cuda/mmq.cuh        +0 -9
llm/llama.cpp/ggml-cuda/mmvq.cu        +0 -404
llm/llama.cpp/ggml-cuda/mmvq.cuh       +0 -7
llm/llama.cpp/ggml-cuda/norm.cu        +0 -221
llm/llama.cpp/ggml-cuda/norm.cuh       +0 -7
llm/llama.cpp/ggml-cuda/pad.cu         +0 -49
llm/llama.cpp/ggml-cuda/pad.cuh        +0 -5
llm/llama.cpp/ggml-cuda/pool2d.cu      +0 -94
llm/llama.cpp/ggml-cuda/pool2d.cuh     +0 -5
llm/llama.cpp/ggml-cuda/quantize.cu    +0 -45
llm/llama.cpp/ggml-cuda/quantize.cuh   +0 -5
llm/llama.cpp/ggml-cuda/rope.cu        +0 -330
llm/llama.cpp/ggml-cuda/rope.cuh       +0 -5
llm/llama.cpp/ggml-cuda/scale.cu       +0 -31
llm/llama.cpp/ggml-cuda/scale.cuh      +0 -5
llm/llama.cpp/ggml-cuda/softmax.cu     +0 -211
llm/llama.cpp/ggml-cuda/softmax.cuh    +0 -5
Too many changes to show. To preserve performance only 193 of 193+ files are displayed.
llm/llama.cpp/ggml-cuda/im2col.cu    deleted    100644 → 0    @ 97b02a89
#include "im2col.cuh"
template <typename T>
static __global__ void im2col_kernel(
        const float * x, T * dst, int64_t batch_offset, int64_t offset_delta,
        int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
        int64_t pelements, int64_t CHW, int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= pelements) {
        return;
    }

    const int64_t ksize = OW * (KH > 1 ? KW : 1);
    const int64_t kx = i / ksize;
    const int64_t kd = kx * ksize;
    const int64_t ky = (i - kd) / OW;
    const int64_t ix = i % OW;

    const int64_t oh    = blockIdx.y;
    const int64_t batch = blockIdx.z / IC;
    const int64_t ic    = blockIdx.z % IC;

    const int64_t iiw = ix * s0 + kx * d0 - p0;
    const int64_t iih = oh * s1 + ky * d1 - p1;

    const int64_t offset_dst =
        ((batch * OH + oh) * OW + ix) * CHW +
        (ic * (KW * KH) + ky * KW + kx);

    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
        dst[offset_dst] = 0.0f;
    } else {
        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
        dst[offset_dst] = x[offset_src + iih * IW + iiw];
    }
}

template <typename T>
static void im2col_cuda(
        const float * x, T * dst,
        int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
        int64_t batch, int64_t batch_offset, int64_t offset_delta,
        int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
    const int parallel_elements = OW * KW * KH;
    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
    dim3 block_nums(num_blocks, OH, batch * IC);
    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(
        x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH,
        parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
}

static void im2col_cuda_f16(
        const float * x, half * dst,
        int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
        int64_t batch, int64_t batch_offset, int64_t offset_delta,
        int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
}

static void im2col_cuda_f32(
        const float * x, float * dst,
        int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
        int64_t batch, int64_t batch_offset, int64_t offset_delta,
        int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
}

void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

    const int64_t IC = src1->ne[is_2D ? 2 : 1];
    const int64_t IH = is_2D ? src1->ne[1] : 1;
    const int64_t IW =         src1->ne[0];

    const int64_t KH = is_2D ? src0->ne[1] : 1;
    const int64_t KW =         src0->ne[0];

    const int64_t OH = is_2D ? dst->ne[2] : 1;
    const int64_t OW =         dst->ne[1];

    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
    const int64_t batch = src1->ne[3];
    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

    if (dst->type == GGML_TYPE_F16) {
        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
    } else {
        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
    }
}
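For orientation: the OW/OH values that ggml_cuda_op_im2col reads from dst->ne are expected to already follow the usual dilated-convolution output-size arithmetic for the s0/s1, p0/p1, d0/d1 parameters above. The helper below is not part of the deleted file; it is a minimal sketch of that relationship, and the function name is hypothetical.

// Hypothetical helper, not in the original source: output extent along one axis
// of a dilated convolution, i.e. OW = conv_out_size(IW, KW, s0, p0, d0) and,
// in the 2D case, OH = conv_out_size(IH, KH, s1, p1, d1).
static inline int64_t conv_out_size(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
    return (in + 2*p - d*(k - 1) - 1) / s + 1;
}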
llm/llama.cpp/ggml-cuda/im2col.cuh    deleted    100644 → 0    @ 97b02a89
#include "common.cuh"
#define CUDA_IM2COL_BLOCK_SIZE 256
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
llm/llama.cpp/ggml-cuda/mmq.cu    deleted    100644 → 0    @ 97b02a89
#include "mmq.cuh"
#include "vecdotq.cuh"
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
typedef void (*load_tiles_cuda_t)(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
typedef float (*vec_dot_q_mul_mat_cuda_t)(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
typedef void (mul_mat_q_t)(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);

struct mmq_arch_config_t {
    int x;
    int y;
    int nwarps;
};

struct mmq_config_t {
    mmq_arch_config_t rdna2;
    mmq_arch_config_t rdna1;
    mmq_arch_config_t ampere;
    mmq_arch_config_t pascal;
};

constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
//  x    y  nwarps
    { 64, 128, 8},
    { 64,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    { 64, 128, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
//  x    y  nwarps
    { 64, 128, 8},
    { 64,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    { 64, 128, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
//  x    y  nwarps
    { 64, 128, 8},
    { 64,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    {128,  64, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
//  x    y  nwarps
    { 64, 128, 8},
    { 64,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    {128,  64, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
//  x    y  nwarps
    { 64, 128, 8},
    { 64,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    {128,  64, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
//  x    y  nwarps
    { 64, 128, 8},
    {128,  32, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    { 64, 128, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
//  x    y  nwarps
    {128,  64, 8},
    { 32, 128, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    {128, 128, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
//  x    y  nwarps
    { 64, 128, 8},
    { 32,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    { 64, 128, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
//  x    y  nwarps
    { 64, 128, 8},
    { 32,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    { 64, 128, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
//  x    y  nwarps
    { 64, 128, 8},
    { 32,  64, 8},
#ifdef CUDA_USE_TENSOR_CORES
    {  4,  32, 4},
#else
    { 64,  64, 4},
#endif // CUDA_USE_TENSOR_CORES
    { 64,  64, 8},
};
// ------------------------------------------------------------
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ float tile_x_d [mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];

    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_0;
    const int kqsx = k % QI4_0;

    const block_q4_0 * bx0 = (const block_q4_0 *) vx;

    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
        if (need_check) { i = min(i, i_max); }
        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const float * x_dmf = (const float *) x_dm;

    int u[2*VDR_Q4_0_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];

    *x_ql = tile_x_qs;
    *x_dm = tile_x_dm;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_1;
    const int kqsx = k % QI4_1;

    const block_q4_1 * bx0 = (const block_q4_1 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
        if (need_check) { i = min(i, i_max); }
        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
    }
}

static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

    int u[2*VDR_Q4_1_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ float tile_x_d [mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];

    *x_ql = tile_x_ql;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_0;
    const int kqsx = k % QI5_0;

    const block_q5_0 * bx0 = (const block_q5_0 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;

        const int ql = get_int_from_uint8(bxi->qs, kqsx);
        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));

        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;

        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
        if (need_check) { i = min(i, i_max); }
        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    int u[2*VDR_Q5_0_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
    }

    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
         y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_1;
    const int kqsx = k % QI5_1;

    const block_q5_1 * bx0 = (const block_q5_1 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));

        int qs0 = (ql >>  0) & 0x0F0F0F0F;
        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;

        int qs1 = (ql >>  4) & 0x0F0F0F0F;
        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
        if (need_check) { i = min(i, i_max); }
        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
    }
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

    int u[2*VDR_Q5_1_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
    }

    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ float tile_x_d [mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];

    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI8_0;
    const int kqsx = k % QI8_0;
    float * x_dmf = (float *) x_dm;

    const block_q8_0 * bx0 = (const block_q8_0 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
        if (need_check) { i = min(i, i_max); }
        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);
    GGML_UNUSED(x_sc);

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI2_K;
    const int kqsx = k % QI2_K;

    const block_q2_K * bx0 = (const block_q2_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
        if (need_check) { i = min(i, i_max); }
        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
    }
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const int kbx = k / QI2_K;
    const int ky  = (k % QI2_K) * QR2_K;
    const float * y_df = (const float *) y_ds;

    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];

    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));

#pragma unroll
    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
    }

    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;

    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI3_K;
    const int kqsx = k % QI3_K;

    const block_q3_K * bx0 = (const block_q3_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
        if (need_check) { i = min(i, i_max); }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);

        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
        if (need_check) { i = min(i, i_max); }
        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);

        const int ksc = k % (QI3_K/4);

        const int ksc_low = ksc % (QI3_K/8);
        const int shift_low = 4 * (ksc / (QI3_K/8));
        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;

        const int ksc_high = QI3_K/8;
        const int shift_high = 2 * ksc;
        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;

        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
    }
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    const int kbx = k / QI3_K;
    const int ky  = (k % QI3_K) * QR3_K;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
        const int shift = 2 * ((ky % 32) / 8);
        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;

        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
        const int vlh = (vh << 2) & 0x04040404;

        v[l] = __vsubss4(vll, vlh);
    }

    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256

    const block_q4_K * bx0 = (const block_q4_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

        const int * scales = (const int *) bxi->scales;

        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    const block_q5_K * bx0 = (const block_q5_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR5_K*kqsx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;

        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

        const int * scales = (const int *) bxi->scales;

        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    GGML_UNUSED(x_qh);

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    GGML_UNUSED(x_qh);

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    const block_q6_K * bx0 = (const block_q6_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;
        if (need_check) { i = min(i, i_max); }
        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR6_K*kqsx;

        const int ql = get_int_from_uint8(bxi->ql, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;

        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
        if (need_check) { i = min(i, i_max); }
        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
    }
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    GGML_UNUSED(x_qh);

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc,
                                      x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
}
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
static __device__ __forceinline__ void mul_mat_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    const int blocks_per_row_x = ncols_x / qk;
    const int blocks_per_col_y = nrows_y / QK8_1;
    const int blocks_per_warp = WARP_SIZE / qi;

    const int & ncols_dst = ncols_y;

    const int row_dst_0 = blockIdx.x*mmq_y;
    const int & row_x_0 = row_dst_0;

    const int col_dst_0 = blockIdx.y*mmq_x;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
    half2 * tile_x_dm = nullptr;
    int   * tile_x_qh = nullptr;
    int   * tile_x_sc = nullptr;

    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

    __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

#pragma unroll
        for (int ir = 0; ir < qr; ++ir) {
            const int kqs = ir*WARP_SIZE + threadIdx.x;
            const int kbxd = kqs / QI8_1;

#pragma unroll
            for (int i = 0; i < mmq_x; i += nwarps) {
                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
            }

#pragma unroll
            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);

                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
                if (need_sum) {
                    *dsi_dst = *dsi_src;
                } else {
                    float * dfi_dst = (float *) dsi_dst;
                    *dfi_dst = __low2float(*dsi_src);
                }
            }

            __syncthreads();

// #pragma unroll // unrolling this loop causes too much register pressure
            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
#pragma unroll
                for (int j = 0; j < mmq_x; j += nwarps) {
#pragma unroll
                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
                            threadIdx.x + i, threadIdx.y + j, k);
                    }
                }
            }

            __syncthreads();
        }
    }

#pragma unroll
    for (int j = 0; j < mmq_x; j += nwarps) {
        const int col_dst = col_dst_0 + j + threadIdx.y;

        if (col_dst >= ncols_dst) {
            return;
        }

#pragma unroll
        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
            const int row_dst = row_dst_0 + threadIdx.x + i;

            if (row_dst >= nrows_dst) {
                continue;
            }

            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
        }
    }
}
static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

#if defined(RDNA3) || defined(RDNA2)
    return mmq_config.rdna2;
#else
    return mmq_config.rdna1;
#endif // defined(RDNA3) || defined(RDNA2)

#else

#if __CUDA_ARCH__ >= CC_VOLTA
    return mmq_config.ampere;
#else
    return mmq_config.pascal;
#endif // __CUDA_ARCH__ >= CC_VOLTA

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q4_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q4_0<arch_config.y>, load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q4_1(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q4_1<arch_config.y>, load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q5_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q5_0<arch_config.y>, load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q5_1(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q5_1<arch_config.y>, load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q8_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q8_0<arch_config.y>, load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q2_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q2_K<arch_config.y>, load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q3_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q3_K<arch_config.y>, load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <bool need_check> static __global__ __launch_bounds__(1024) void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q4_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps,
        allocate_tiles_q4_K<arch_config.y>, load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>,
        VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(get_arch_config_device);
    GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template
<
bool
need_check
>
static
__global__
__launch_bounds__
(
1024
)
void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__
(
WARP_SIZE
*
MMQ_CONFIG_Q5_K
.
rdna2
.
nwarps
,
2
)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_K
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A
constexpr
mmq_arch_config_t
arch_config
=
get_arch_config_device
(
MMQ_CONFIG_Q5_K
);
mul_mat_q
<
QK_K
,
QR5_K
,
QI5_K
,
true
,
block_q5_K
,
arch_config
.
x
,
arch_config
.
y
,
arch_config
.
nwarps
,
allocate_tiles_q5_K
<
arch_config
.
y
>
,
load_tiles_q5_K
<
arch_config
.
y
,
arch_config
.
nwarps
,
need_check
>
,
VDR_Q5_K_Q8_1_MMQ
,
vec_dot_q5_K_q8_1_mul_mat
>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
#else
GGML_UNUSED
(
get_arch_config_device
);
GGML_UNUSED
(
vec_dot_q5_K_q8_1_mul_mat
);
NO_DEVICE_CODE
;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template
<
bool
need_check
>
static
__global__
__launch_bounds__
(
1024
)
void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__
(
WARP_SIZE
*
MMQ_CONFIG_Q6_K
.
rdna2
.
nwarps
,
2
)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
__launch_bounds__
(
WARP_SIZE
*
MMQ_CONFIG_Q4_K
.
pascal
.
nwarps
,
2
)
#endif // __CUDA_ARCH__ < CC_VOLTA
mul_mat_q6_K
(
const
void
*
__restrict__
vx
,
const
void
*
__restrict__
vy
,
float
*
__restrict__
dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_y
,
const
int
nrows_y
,
const
int
nrows_dst
)
{
#if __CUDA_ARCH__ >= MIN_CC_DP4A
constexpr
mmq_arch_config_t
arch_config
=
get_arch_config_device
(
MMQ_CONFIG_Q6_K
);
mul_mat_q
<
QK_K
,
QR6_K
,
QI6_K
,
false
,
block_q6_K
,
arch_config
.
x
,
arch_config
.
y
,
arch_config
.
nwarps
,
allocate_tiles_q6_K
<
arch_config
.
y
>
,
load_tiles_q6_K
<
arch_config
.
y
,
arch_config
.
nwarps
,
need_check
>
,
VDR_Q6_K_Q8_1_MMQ
,
vec_dot_q6_K_q8_1_mul_mat
>
(
vx
,
vy
,
dst
,
ncols_x
,
nrows_x
,
ncols_y
,
nrows_y
,
nrows_dst
);
#else
GGML_UNUSED
(
get_arch_config_device
);
GGML_UNUSED
(
vec_dot_q6_K_q8_1_mul_mat
);
NO_DEVICE_CODE
;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
#define MMQ_SWITCH_CASE(type_suffix) \
case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
const bool need_check = false; \
mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
} else { \
const bool need_check = true; \
mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
} break; \

void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;

    int id = ggml_cuda_get_device();
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    mmq_config_t mmq_config;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            mmq_config = MMQ_CONFIG_Q4_0;
            break;
        case GGML_TYPE_Q4_1:
            mmq_config = MMQ_CONFIG_Q4_1;
            break;
        case GGML_TYPE_Q5_0:
            mmq_config = MMQ_CONFIG_Q5_0;
            break;
        case GGML_TYPE_Q5_1:
            mmq_config = MMQ_CONFIG_Q5_1;
            break;
        case GGML_TYPE_Q8_0:
            mmq_config = MMQ_CONFIG_Q8_0;
            break;
        case GGML_TYPE_Q2_K:
            mmq_config = MMQ_CONFIG_Q2_K;
            break;
        case GGML_TYPE_Q3_K:
            mmq_config = MMQ_CONFIG_Q3_K;
            break;
        case GGML_TYPE_Q4_K:
            mmq_config = MMQ_CONFIG_Q4_K;
            break;
        case GGML_TYPE_Q5_K:
            mmq_config = MMQ_CONFIG_Q5_K;
            break;
        case GGML_TYPE_Q6_K:
            mmq_config = MMQ_CONFIG_Q6_K;
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    mmq_arch_config_t arch_config;
    if (compute_capability >= CC_RDNA2) {
        arch_config = mmq_config.rdna2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        arch_config = mmq_config.rdna1;
    } else if (compute_capability >= CC_VOLTA) {
        arch_config = mmq_config.ampere;
    } else if (compute_capability >= MIN_CC_DP4A) {
        arch_config = mmq_config.pascal;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
    const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);

    switch (src0->type) {
        MMQ_SWITCH_CASE(4_0)
        MMQ_SWITCH_CASE(4_1)
        MMQ_SWITCH_CASE(5_0)
        MMQ_SWITCH_CASE(5_1)
        MMQ_SWITCH_CASE(8_0)
        MMQ_SWITCH_CASE(2_K)
        MMQ_SWITCH_CASE(3_K)
        MMQ_SWITCH_CASE(4_K)
        MMQ_SWITCH_CASE(5_K)
        MMQ_SWITCH_CASE(6_K)
        default:
            GGML_ASSERT(false);
            break;
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
}

bool ggml_cuda_supports_mmq(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
            return false;
    }
}
llm/llama.cpp/ggml-cuda/mmq.cuh deleted 100644 → 0
#include "common.cuh"

void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);

bool ggml_cuda_supports_mmq(enum ggml_type type);
llm/llama.cpp/ggml-cuda/mmvq.cu deleted 100644 → 0
#include "mmvq.cuh"
#include "vecdotq.cuh"

typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);

template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ __launch_bounds__(1024) void mul_mat_vec_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
    constexpr int nwarps              = 1;
    constexpr int rows_per_cuda_block = 1;
#else
    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)

    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    const     int row0 = rows_per_cuda_block*blockIdx.x;
    const     int blocks_per_row_x = ncols_x / qk;
    const     int blocks_per_col_y = nrows_y / QK8_1;
    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;

    // partial sum for each thread
    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

        // x block quant index when casting the quants to int
        const int kqs = vdr * (tid % (qi/vdr));

#pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp[j][i] += vec_dot_q_cuda(&x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
            }
        }
    }

    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
    if (threadIdx.y > 0) {
#pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
            }
        }
    }
    __syncthreads();
    if (threadIdx.y > 0) {
        return;
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
        for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
            for (int l = 0; l < nwarps-1; ++l) {
                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
            }
            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
        }

        if (threadIdx.x < rows_per_cuda_block) {
            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
        }
    }
}

template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
static void mul_mat_vec_q_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    GGML_ASSERT(ncols_x % qk == 0);
    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);

    int id = ggml_cuda_get_device();

    int64_t nwarps = 1;
    int64_t rows_per_cuda_block = 1;

    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
        switch(ncols_y) {
            case 1:
                nwarps = 4;
                rows_per_cuda_block = 1;
                break;
            case 2:
            case 3:
            case 4:
                nwarps = 4;
                rows_per_cuda_block = 2;
                break;
            case 5:
            case 6:
            case 7:
            case 8:
                nwarps = 2;
                rows_per_cuda_block = 2;
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
    }
    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
    const dim3 block_nums(nblocks, 1, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    switch (ncols_y) {
        case 1:
            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 2:
            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 3:
            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 4:
            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 5:
            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 6:
            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 7:
            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 8:
            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }
}

static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_m, 1, vec_dot_iq1_m_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    int id = ggml_cuda_get_device();

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_M:
            mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}
llm/llama.cpp/ggml-cuda/mmvq.cuh deleted 100644 → 0
#include "common.cuh"

void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);
llm/llama.cpp/ggml-cuda/norm.cu deleted 100644 → 0
#include "norm.cuh"

template <int block_size>
static __global__ __launch_bounds__(1024) void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float2 mean_var = make_float2(0.f, 0.f);

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row*ncols + col];
        mean_var.x += xi;
        mean_var.y += xi * xi;
    }

    // sum up partial sums
    mean_var = warp_reduce_sum(mean_var);
    if (block_size > WARP_SIZE) {
        __shared__ float2 s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = mean_var;
        }
        __syncthreads();
        mean_var = s_sum[lane_id];
        mean_var = warp_reduce_sum(mean_var);
    }

    const float mean = mean_var.x / ncols;
    const float var = mean_var.y / ncols - mean * mean;
    const float inv_std = rsqrtf(var + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
    }
}

template <int block_size>
static __global__ __launch_bounds__(1024) void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
    // blockIdx.x: num_groups idx
    // threadIdx.x: block_size idx
    int start = blockIdx.x * group_size;
    int end = start + group_size;

    start += threadIdx.x;

    if (end >= ne_elements) {
        end = ne_elements;
    }

    float tmp = 0.0f; // partial sum for thread in warp

    for (int j = start; j < end; j += block_size) {
        tmp += x[j];
    }

    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    float mean = tmp / group_size;
    tmp = 0.0f;

    for (int j = start; j < end; j += block_size) {
        float xi = x[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    float variance = tmp / group_size;
    float scale = rsqrtf(variance + eps);
    for (int j = start; j < end; j += block_size) {
        dst[j] *= scale;
    }
}

template <int block_size>
static __global__ __launch_bounds__(1024) void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float tmp = 0.0f; // partial sum for thread in warp

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row*ncols + col];
        tmp += xi * xi;
    }

    // sum up partial sums
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = scale * x[row*ncols + col];
    }
}

static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    }
}

static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
    static const float eps = 1e-6f;
    if (group_size < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    }
}

static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    }
}

void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
}

void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    int num_groups = dst->op_params[0];
    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
}

void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
}
llm/llama.cpp/ggml-cuda/norm.cuh deleted 100644 → 0
#include "common.cuh"

void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
llm/llama.cpp/ggml-cuda/pad.cu deleted 100644 → 0
#include "pad.cuh"

static __global__ __launch_bounds__(1024) void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
    // blockIdx.y: idx of ne1
    // blockIDx.x: idx of ne0 / BLOCK_SIZE
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // operation
    int offset_dst = nidx + blockIdx.y * ne0 + blockIdx.z * ne0 * gridDim.y;
    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
        int offset_src = nidx + blockIdx.y * ne00 + blockIdx.z * ne00 * ne01;
        dst[offset_dst] = x[offset_src];
    } else {
        dst[offset_dst] = 0.0f;
    }
}

static void pad_f32_cuda(const float * x, float * dst,
    const int ne00, const int ne01, const int ne02, const int ne03,
    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2*ne3);
    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
}

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    pad_f32_cuda(src0_d, dst_d,
        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}
llm/llama.cpp/ggml-cuda/pad.cuh deleted 100644 → 0
#include "common.cuh"

#define CUDA_PAD_BLOCK_SIZE 256

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
llm/llama.cpp/ggml-cuda/pool2d.cu deleted 100644 → 0
#include "pool2d.cuh"

template <typename Ti, typename To>
static __global__ void pool2d_nchw_kernel(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const Ti * src, To * dst, const enum ggml_op_pool op) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= parallel_elements) {
        return;
    }

    const int I_HW = ih * iw;
    const int O_HW = oh * ow;
    const int nc = idx / O_HW;
    const int cur_oh = idx % O_HW / ow;
    const int cur_ow = idx % O_HW % ow;
    const Ti * i_ptr = src + nc * I_HW;
    To * o_ptr = dst + nc * O_HW;
    const int start_h = cur_oh * sh - ph;
    const int bh = max(0, start_h);
    const int eh = min(ih, start_h + kh);
    const int start_w = cur_ow * sw - pw;
    const int bw = max(0, start_w);
    const int ew = min(iw, start_w + kw);
    const To scale = 1. / (kh * kw);
    To res = 0;

    switch (op) {
        case GGML_OP_POOL_AVG: res = 0; break;
        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
        default: assert(false);
    }

    for (int i = bh; i < eh; i += 1) {
        for (int j = bw; j < ew; j += 1) {
#if __CUDA_ARCH__ >= 350
            Ti cur = __ldg(i_ptr + i * iw + j);
#else
            Ti cur = i_ptr[i * iw + j];
#endif
            switch (op) {
                case GGML_OP_POOL_AVG: res += cur * scale; break;
                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
                default: assert(false);
            }
        }
    }
    o_ptr[cur_oh * ow + cur_ow] = res;
}

static void pool2d_nchw_kernel_f32_f32_cuda(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const float * src, float * dst, const enum ggml_op_pool op, cudaStream_t stream) {
    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
    dim3 block_nums(num_blocks);
    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
}

void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int32_t * opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

    const int64_t IH = src0->ne[1];
    const int64_t IW = src0->ne[0];

    const int64_t N = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    const int parallel_elements = N * OC * OH * OW;

    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
}
llm/llama.cpp/ggml-cuda/pool2d.cuh deleted 100644 → 0
#include "common.cuh"

#define CUDA_POOL2D_BLOCK_SIZE 256

void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
llm/llama.cpp/ggml-cuda/quantize.cu deleted 100644 → 0
#include "quantize.cuh"

static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (ix >= kx_padded) {
        return;
    }

    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;

    const int64_t i_padded = (int64_t)iy*kx_padded + ix;

    block_q8_1 * y = (block_q8_1 *) vy;

    const int64_t ib = i_padded / QK8_1;  // block index
    const int64_t iqs = i_padded % QK8_1; // quant index

    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
    float amax = fabsf(xi);
    float sum = xi;

    amax = warp_reduce_max(amax);
    sum = warp_reduce_sum(sum);

    const float d = amax / 127;
    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);

    y[ib].qs[iqs] = q;

    if (iqs > 0) {
        return;
    }

    reinterpret_cast<half&>(y[ib].ds.x) = d;
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
}

void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky, 1);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}
llm/llama.cpp/ggml-cuda/quantize.cuh deleted 100644 → 0
#include "common.cuh"

#define CUDA_QUANTIZE_BLOCK_SIZE 256

void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
llm/llama.cpp/ggml-cuda/rope.cu deleted 100644 → 0
#include "rope.cuh"

struct rope_corr_dims {
    float v[4];
};

static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - min(1.0f, max(0.0f, y));
}

// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static __device__ void rope_yarn(
    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
    float * cos_theta, float * sin_theta) {
    // Get n-d rotational scaling corrected for extrapolation
    float theta_interp = freq_scale * theta_extrap;
    float theta = theta_interp;
    if (ext_factor != 0.0f) {
        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

        // Get n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
    }
    *cos_theta = cosf(theta) * mscale;
    *sin_theta = sinf(theta) * mscale;
}

// rope == RoPE == rotary positional embedding
template<typename T, bool has_pos>
static __global__ __launch_bounds__(1024) void rope(
    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

    const int p = has_pos ? pos[i2] : 0;
    const float theta_base = p*powf(freq_base, -float(col)/ncols);

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + 1];

    dst[i + 0] = x0*cos_theta - x1*sin_theta;
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
}

template<typename T, bool has_pos, bool has_freq_facs>
static __global__ __launch_bounds__(1024) void rope_neox(
    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int ib = col / n_dims;
    const int ic = col % n_dims;

    if (ib > 0) {
        const int i = row*ncols + ib*n_dims + ic;

        dst[i + 0] = x[i + 0];
        dst[i + 1] = x[i + 1];

        return;
    }

    const int i = row*ncols + ib*n_dims + ic/2;
    const int i2 = row/p_delta_rows;

    const int p = has_pos ? pos[i2] : 0;
    const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;

    const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor;

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims/2];

    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}

static __global__ __launch_bounds__(1024) void rope_glm_f32(
    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
    int n_ctx) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;

    if (col >= half_n_dims) {
        return;
    }

    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);

    // FIXME: this is likely wrong
    const int p = pos != nullptr ? pos[i2] : 0;

    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + half_n_dims];

    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);

    const float x2 = x[i + half_n_dims * 2];
    const float x3 = x[i + half_n_dims * 3];

    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}

template<typename T>
static void rope_cuda(
    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    if (pos == nullptr) {
        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims);
    } else {
        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims);
    }
}

template<typename T>
static void rope_neox_cuda(
    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);

    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    if (pos == nullptr) {
        if (freq_factors == nullptr) {
            rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors);
        } else {
            rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors);
        }
    } else {
        if (freq_factors == nullptr) {
            rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors);
        } else {
            rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors);
        }
    }
}

static void rope_glm_f32_cuda(
    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, int n_ctx, cudaStream_t stream) {
    GGML_ASSERT(ncols % 4 == 0);
    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
    const dim3 block_nums(num_blocks_x, nrows, 1);
    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
}

static void rope_cuda_f16(
    const half * x, half * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
    rope_cuda<half>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
}

static void rope_cuda_f32(
    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
    rope_cuda<float>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
}

static void rope_neox_cuda_f16(
    const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}

static void rope_neox_cuda_f32(
    const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}

void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;

    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t nrows = ggml_nrows(src0);

    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims     = ((int32_t *) dst->op_params)[1];
    const int mode       = ((int32_t *) dst->op_params)[2];
    const int n_ctx      = ((int32_t *) dst->op_params)[3];
    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

    // RoPE alteration for extended context
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    memcpy(&freq_base,   (int32_t *) dst->op_params + 5,  sizeof(float));
    memcpy(&freq_scale,  (int32_t *) dst->op_params + 6,  sizeof(float));
    memcpy(&ext_factor,  (int32_t *) dst->op_params + 7,  sizeof(float));
    memcpy(&attn_factor, (int32_t *) dst->op_params + 8,  sizeof(float));
    memcpy(&beta_fast,   (int32_t *) dst->op_params + 9,  sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));

    const float * freq_factors = nullptr;
    const int32_t * pos = nullptr;

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

    pos = (const int32_t *) src1_d;

    if (is_neox) {
        if (src2 != nullptr) {
            freq_factors = (const float *) src2->data;
        }
    } else {
        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
    }

    rope_corr_dims corr_dims;
    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);

    // compute
    if (is_glm) {
        GGML_ASSERT(false);
        rope_glm_f32_cuda(src0_d, dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, stream);
    } else if (is_neox) {
        if (src0->type == GGML_TYPE_F32) {
            rope_neox_cuda_f32(
                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, freq_factors, stream);
        } else if (src0->type == GGML_TYPE_F16) {
            rope_neox_cuda_f16(
                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, freq_factors, stream);
        } else {
            GGML_ASSERT(false);
        }
    } else {
        if (src0->type == GGML_TYPE_F32) {
            rope_cuda_f32(
                (const float *)src0_d, (float *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, stream);
        } else if (src0->type == GGML_TYPE_F16) {
            rope_cuda_f16(
                (const half *)src0_d, (half *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, stream);
        } else {
            GGML_ASSERT(false);
        }
    }
}
llm/llama.cpp/ggml-cuda/rope.cuh deleted 100644 → 0
#include "common.cuh"

#define CUDA_ROPE_BLOCK_SIZE 256

void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
llm/llama.cpp/ggml-cuda/scale.cu deleted 100644 → 0
#include "scale.cuh"

static __global__ __launch_bounds__(1024) void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = scale * x[i];
}

static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float scale;
    memcpy(&scale, dst->op_params, sizeof(float));

    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
}
llm/llama.cpp/ggml-cuda/scale.cuh deleted 100644 → 0
#include "common.cuh"

#define CUDA_SCALE_BLOCK_SIZE 256

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
llm/llama.cpp/ggml-cuda/softmax.cu
deleted
100644 → 0
View file @
97b02a89
#include "common.cuh"
#include "softmax.cuh"
template
<
typename
T
>
static
__device__
__forceinline__
float
t2f32
(
T
val
)
{
return
(
float
)
val
;
}
template
<
>
__device__
float
__forceinline__
t2f32
<
half
>
(
half
val
)
{
return
__half2float
(
val
);
}
template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ __launch_bounds__(1024) void soft_max_f32(
        const float * x, const T * mask, float * dst,
        const int ncols_par, const int nrows_y,
        const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

    const int tid  = threadIdx.x;
    const int rowx = blockIdx.x;
    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension

    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);

    extern __shared__ float data_soft_max_f32[];
    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
    // shared memory buffer to cache values between iterations:
    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;

    float max_val = -INFINITY;

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            break;
        }

        const int64_t ix = (int64_t)rowx*ncols + col;
        const int64_t iy = (int64_t)rowy*ncols + col;

        const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);

        vals[col] = val;
        max_val = max(max_val, val);
    }

    // find the max value in the block
    max_val = warp_reduce_max(max_val);
    if (block_size > WARP_SIZE) {
        if (warp_id == 0) {
            buf_iw[lane_id] = -INFINITY;
        }
        __syncthreads();

        if (lane_id == 0) {
            buf_iw[warp_id] = max_val;
        }
        __syncthreads();

        max_val = buf_iw[lane_id];
        max_val = warp_reduce_max(max_val);
    }

    float tmp = 0.0f; // partial sum

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            break;
        }

        const float val = expf(vals[col] - max_val);
        tmp += val;
        vals[col] = val;
    }

    // find the sum of exps in the block
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __syncthreads();
        if (warp_id == 0) {
            buf_iw[lane_id] = 0.0f;
        }
        __syncthreads();

        if (lane_id == 0) {
            buf_iw[warp_id] = tmp;
        }
        __syncthreads();

        tmp = buf_iw[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    const float inv_sum = 1.0f / tmp;

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            return;
        }

        const int64_t idst = (int64_t)rowx*ncols + col;
        dst[idst] = vals[col]*inv_sum;
    }
}
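As a cross-check, this is what one row of the kernel computes, written as a minimal host-side reference (a sketch assuming the same scale/slope/mask semantics; not part of the deleted file):

// Hypothetical single-row CPU reference for soft_max_f32 above.
static void soft_max_row_ref(const float * x, const float * mask, float * dst,
                             const int ncols, const float scale, const float slope) {
    float max_val = -INFINITY;
    for (int c = 0; c < ncols; ++c) {          // pass 1: scaled + masked values and their max
        const float v = x[c]*scale + (mask ? slope*mask[c] : 0.0f);
        dst[c] = v;
        max_val = fmaxf(max_val, v);
    }
    float sum = 0.0f;
    for (int c = 0; c < ncols; ++c) {          // pass 2: exponentiate relative to the max, accumulate the sum
        dst[c] = expf(dst[c] - max_val);
        sum += dst[c];
    }
    for (int c = 0; c < ncols; ++c) {          // pass 3: normalize
        dst[c] /= sum;
    }
}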
template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
    int nth = WARP_SIZE; // 32
    // printf("warpsize: %d\n", WARP_SIZE);
    // printf("softmax size: %d\n", CUDA_SOFT_MAX_BLOCK_SIZE); // 256
    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
    // printf("ncols_x: %d\n", ncols_x);
    // printf("nth: %d\n", nth);
    const dim3 block_dims(nth,     1, 1);
    const dim3 block_nums(nrows_x, 1, 1);
    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

    const uint32_t n_head      = nrows_x / nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
        switch (ncols_x) {
            case 32:
                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 64:
                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 128:
                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 256:
                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 512:
                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 1024:
                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 2048:
                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 4096:
                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            default:
                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
        }
    } else {
        const size_t shmem_low = WARP_SIZE*sizeof(float);
        printf("%d\n", ncols_x);
        // printf("%d, %d, %d", block_nums, block_dims, shmem_low);
        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
    }
}
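The launcher derives the ALiBi parameters from max_bias and the head count (n_head = nrows_x / nrows_y). For example, with max_bias = 8 and n_head = 12: n_head_log2 = 1 << floor(log2(12)) = 8, m0 = 2^(-8/8) = 0.5, and m1 = 2^(-4/8) ≈ 0.7071. The per-head slope itself comes from get_alibi_slope, which lives in common.cuh and is not shown in this diff; a hedged sketch of what that helper presumably does:

// Hedged sketch of the get_alibi_slope helper assumed above (the real one is in common.cuh).
static __device__ __forceinline__ float get_alibi_slope_sketch(
        const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f;                                  // no ALiBi bias requested: mask is applied unscaled
    }
    const float base = h < n_head_log2 ? m0 : m1;     // first 2^k heads use m0, the remaining heads use m1
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
    return powf(base, exph);
}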
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    const float * src0_d = (const float *)src0->data;
    const void  * src1_d = src1 ? (const void *)src1->data : nullptr;

    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional

    const int64_t ne00    = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
    const int64_t nrows_y = src0->ne[1];

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

    if (use_f16) {
        const half * src1_dd = (const half *)src1_d;

        soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
    } else {
        const float * src1_dd = (const float *)src1_d;

        soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
    }
}
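The two memcpy calls above read scale and max_bias from dst->op_params[0] and dst->op_params[1]. On the graph-build side these values are presumably written by ggml_soft_max_ext; a hedged usage sketch, assuming the ggml_soft_max_ext(ctx, a, mask, scale, max_bias) signature from ggml.h:

// Hypothetical graph-build usage that produces the op_params consumed above.
// struct ggml_tensor * logits = ...;   // F32 input, ne[0] columns per row
// struct ggml_tensor * mask   = ...;   // optional F16 or F32 mask, or NULL
// struct ggml_tensor * probs  = ggml_soft_max_ext(ctx, logits, mask,
//                                                 1.0f/sqrtf(head_dim),  // scale    -> op_params[0]
//                                                 0.0f);                 // max_bias -> op_params[1]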
llm/llama.cpp/ggml-cuda/softmax.cuh
deleted
100644 → 0
View file @
97b02a89
#include "common.cuh"
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);