OpenDAS / ollama · Commit 386c53bb
Authored Mar 12, 2025 by xuxzh1

update

Parent commit: 4667452a
Showing 3 changed files with 597 additions and 601 deletions:

llama/ggml-cuda/mmvq.cu       +424  -429
llama/ggml-cuda/quantize.cu   +171  -170
llama/ggml-cuda/vecdotq.cuh     +2    -2
llama/ggml-cuda/mmvq.cu
@@ -24,12 +24,12 @@
  * SOFTWARE.
  */

-#include "mmvq.cuh"
-#include "vecdotq.cuh"
+#include "mmvq.cuh"
+#include "vecdotq.cuh"

-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);

-static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
+static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
     return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
         type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
         type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
@@ -50,9 +50,9 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
         type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
         type == GGML_TYPE_IQ3_S  ? vec_dot_iq3_s_q8_1  :
         nullptr;
-}
+}

-static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
+static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
     return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
         type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
         type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
@@ -71,30 +71,29 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
         type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
         type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
         1;
-}
-
-template <ggml_type type, int ncols_y>
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-// tell the compiler to use as many registers as it wants, see nwarps definition below
-__launch_bounds__((ncols_y <= 4 ? 4 : 2)*16, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-static __global__ void mul_mat_vec_q(
+}
+
+template <ggml_type type, int ncols_y>
+//
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+// tell the compiler to use as many registers as it wants, see nwarps definition below
+//
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+//
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {

     constexpr int qk  = ggml_cuda_type_traits<type>::qk;
     constexpr int qi  = ggml_cuda_type_traits<type>::qi;
     constexpr int vdr = get_vdr_mmvq(type);

     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
     constexpr int nwarps              = 1;
     constexpr int rows_per_cuda_block = 1;
-#else
+#else
     constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
-    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 1;
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)

     const int tid  = WARP_SIZE*threadIdx.y + threadIdx.x;
     const int row0 = rows_per_cuda_block*blockIdx.x;
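The hunk above relies on two ideas worth isolating: a constexpr __device__ function that returns a device function pointer chosen by a template argument (so the "indirect" call compiles down to a direct call), and a __launch_bounds__ cap so the compiler can budget registers per thread. A minimal self-contained sketch of the same pattern follows; every name in it (my_op_t, op_add, op_mul, get_op, apply_op, the 128-thread bound) is a hypothetical illustration, not code from this file:

    #include <cuda_runtime.h>

    typedef float (*my_op_t)(float, float);

    static __device__ float op_add(float a, float b) { return a + b; }
    static __device__ float op_mul(float a, float b) { return a * b; }

    enum my_op_kind { OP_ADD, OP_MUL };

    // Evaluated at compile time when `kind` is a template argument, so the
    // function-pointer call in the kernel lowers to a direct call.
    static constexpr __device__ my_op_t get_op(my_op_kind kind) {
        return kind == OP_ADD ? op_add :
               kind == OP_MUL ? op_mul :
               nullptr;
    }

    template <my_op_kind kind>
    // At most 128 threads per block, target of 1 block per SM: like the
    // __launch_bounds__ in the hunk, this lets the compiler spend more
    // registers per thread than its default heuristics would allow.
    __launch_bounds__(128, 1)
    static __global__ void apply_op(const float * x, const float * y, float * dst, int n) {
        constexpr my_op_t op = get_op(kind);
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = op(x[i], y[i]);
        }
    }

    // Usage: apply_op<OP_ADD><<<(n + 127)/128, 128>>>(x, y, dst, n);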
@@ -102,7 +101,7 @@ static __global__ void mul_mat_vec_q(
     const int blocks_per_col_y = nrows_y / QK8_1;
     constexpr int blocks_per_iter = vdr*nwarps*WARP_SIZE / qi;

-    // partial sum for each thread
+    // partial sum for each thread
     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

     const block_q8_1 * y = (const block_q8_1 *) vy;
@@ -112,24 +111,22 @@ static __global__ void mul_mat_vec_q(
         // x block quant index when casting the quants to int
         const int kqs = vdr * (tid % (qi/vdr));

-#pragma unroll ncols_y
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll rows_per_cuda_block
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                //tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
-                atomicAdd(&tmp[j][i], vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs));
+                tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
             }
         }
     }

     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
     if (threadIdx.y > 0) {
-#pragma unroll ncols_y
+#pragma unroll ncols_y
         for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll rows_per_cuda_block
+#pragma unroll rows_per_cuda_block
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                //tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
-                atomicExch(&tmp_shared[threadIdx.y-1][j][i][threadIdx.x], tmp[j][i]);
+                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
             }
         }
     }
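Dropping atomicAdd/atomicExch here is safe: tmp is a per-thread array (registers or thread-local memory), and in the shared-memory staging step each thread writes a slot indexed by its own threadIdx, so no two threads touch the same location before the block-wide barrier in the elided lines. Atomics are only needed when threads race on a shared destination. The warp_reduce_sum used further down is defined elsewhere in ggml-cuda; as a hedged sketch, such a helper conventionally looks like the following (my_warp_reduce_sum is a hypothetical name, assuming 32-lane warps):

    static __device__ __forceinline__ float my_warp_reduce_sum(float x) {
        // Butterfly reduction: after log2(32) = 5 exchange rounds, every lane
        // of the warp holds the sum over all 32 lanes.
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xFFFFFFFF, x, offset, 32);
        }
        return x;
    }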
@@ -139,28 +136,25 @@ static __global__ void mul_mat_vec_q(
     }

     // sum up partial sums and write back result
-#pragma unroll ncols_y
+#pragma unroll ncols_y
     for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll rows_per_cuda_block
+#pragma unroll rows_per_cuda_block
         for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll
+#pragma unroll
             for (int l = 0; l < nwarps-1; ++l) {
-                //tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
-                atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);
+                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
             }
-            //tmp[j][i] = warp_reduce_sum(tmp[j][i]);
-            atomicExch(&tmp[j][i], warp_reduce_sum(tmp[j][i]));
+            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
         }

         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
-            // dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
-            atomicExch(&dst[j*nrows_dst + row0 + threadIdx.x], tmp[j][threadIdx.x]);
+            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }
-}
+}

-template <ggml_type type>
-static void mul_mat_vec_q_cuda(
+template <ggml_type type>
+static void mul_mat_vec_q_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
@@ -182,7 +176,7 @@ static void mul_mat_vec_q_cuda(
         case 3:
         case 4:
             nwarps = 4;
-            rows_per_cuda_block = 2;
+            rows_per_cuda_block = 1;
            break;
        case 5:
        case 6:
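The body of mul_mat_vec_q_cuda between these hunks is elided, so the following is only a sketch of how nwarps and rows_per_cuda_block conventionally become launch dimensions in this kernel family; launch_sketch and its locals are hypothetical names, and it assumes the surrounding file's mul_mat_vec_q and WARP_SIZE. With rows_per_cuda_block now 1 for ncols_y of 3 or 4, each block covers a single output row:

    template <ggml_type type, int ncols_y>
    static void launch_sketch(
            const void * vx, const void * vy, float * dst,
            const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst,
            const int nwarps, const int rows_per_cuda_block, cudaStream_t stream) {
        // Round up so every row of x is covered by some block.
        const int nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
        const dim3 block_nums(nblocks, 1, 1);
        const dim3 block_dims(WARP_SIZE, nwarps, 1); // WARP_SIZE lanes x nwarps warps
        mul_mat_vec_q<type, ncols_y><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
    }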
@@ -229,142 +223,142 @@ static void mul_mat_vec_q_cuda(
             GGML_ABORT("fatal error");
             break;
     }
 }

 static void mul_mat_vec_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q4_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q5_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q5_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q2_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q4_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q5_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_q6_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq2_xxs_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq2_xs_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq2_s_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq3_xxs_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq1_s_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq1_m_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq4_nl_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq4_xs_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 static void mul_mat_vec_iq3_s_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

 void ggml_cuda_op_mul_mat_vec_q(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,

@@ -452,4 +446,5 @@ void ggml_cuda_op_mul_mat_vec_q(
     GGML_UNUSED(src1_ddf_i);
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
-}
+}
\ No newline at end of file
llama/ggml-cuda/quantize.cu

@@ -24,10 +24,10 @@
  * SOFTWARE.
  */

-#include "quantize.cuh"
-#include <cstdint>
+#include "quantize.cuh"
+#include <cstdint>

-static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
+static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
     const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

     if (ix0 >= kx0_padded) {
@@ -58,14 +58,14 @@ static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __res
     if (iqs > 0) {
         return;
     }

-    ggml_half2 ds = {d, sum};
-    y[ib].ds = ds;
+    y[ib].ds = ggml_half2{d, sum};
     //reinterpret_cast<half&>(y[ib].ds) = ds;
     //reinterpret_cast<half&>(y[ib].ds.y) = sum;
-}
+}

-template <mmq_q8_1_ds_layout ds_layout>
-static __global__ void quantize_mmq_q8_1(
+template <mmq_q8_1_ds_layout ds_layout>
+static __global__ void quantize_mmq_q8_1(
     const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {

     constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
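For reference, the per-block math that quantize_q8_1 (closed just above) implements is simple enough to state as a host-side sketch: find the block's absolute maximum, derive the scale d = amax/127, round each value to int8, and keep the block's float sum alongside d. That (d, sum) pair is the ds half2 written above; the sum is consumed by dot-product kernels for quant types that carry a bias/min term. The function below is an illustrative reference under those assumptions, not the kernel itself:

    #include <cmath>
    #include <cstdint>

    constexpr int QK8_1_REF = 32; // block size, mirroring QK8_1

    static void quantize_block_q8_1_ref(const float * x, int8_t * qs,
                                        float * d_out, float * sum_out) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < QK8_1_REF; ++i) {
            amax = std::fmax(amax, std::fabs(x[i]));
            sum += x[i];
        }
        const float d = amax / 127.0f; // largest magnitude maps to +/-127
        for (int i = 0; i < QK8_1_REF; ++i) {
            qs[i] = amax == 0.0f ? 0 : (int8_t) std::round(x[i] / d);
        }
        *d_out   = d;   // stored as ds.x (half)
        *sum_out = sum; // stored as ds.y (half)
    }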
@@ -95,7 +95,7 @@ static __global__ void quantize_mmq_q8_1(
     amax = fmaxf(amax, fabsf(xi.w));

     // Exchange max. abs. value between vals_per_scale/4 threads.
-#pragma unroll
+#pragma unroll
     for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
         amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
     }
@@ -105,7 +105,7 @@ static __global__ void quantize_mmq_q8_1(
         sum = xi.x + xi.y + xi.z + xi.w;

         // Exchange calculate sum across vals_per_sum/4 threads.
-#pragma unroll
+#pragma unroll
         for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
             sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
         }
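Both hunks above use the same XOR-butterfly exchange, but over a subgroup of the warp rather than all 32 lanes: each thread already holds four values (a float4), so vals_per_scale/4 threads share one scale and the first offset is vals_per_scale/8. A hypothetical stand-alone kernel showing the subgroup variant (GROUP = 8 corresponds to a 32-value scale group):

    template <int GROUP> // number of consecutive lanes that share one result
    static __global__ void group_max_demo(const float * x, float * out) {
        float v = x[threadIdx.x];
    #pragma unroll
        for (int offset = GROUP/2; offset > 0; offset >>= 1) {
            // XOR pairing keeps every exchange inside its aligned GROUP-lane window.
            v = fmaxf(v, __shfl_xor_sync(0xFFFFFFFF, v, offset, 32));
        }
        out[threadIdx.x] = v; // every lane in a window now holds the window max
    }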
@@ -151,9 +151,9 @@ static __global__ void quantize_mmq_q8_1(
     } else {
         y[ib].d4[iqs/32] = d;
     }
-}
+}

-void quantize_row_q8_1_cuda(
+void quantize_row_q8_1_cuda(
     const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
     const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
@@ -165,9 +165,9 @@ void quantize_row_q8_1_cuda(
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);

     GGML_UNUSED(type_x);
-}
+}

-void quantize_mmq_q8_1_cuda(
+void quantize_mmq_q8_1_cuda(
     const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
     const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {

@@ -193,4 +193,5 @@ void quantize_mmq_q8_1_cuda(
             GGML_ABORT("fatal error");
             break;
     }
-}
+}
\ No newline at end of file
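The launch configuration feeding quantize_q8_1<<<num_blocks, block_size, ...>>> is elided between the quantize.cu hunks; in this kernel family it is conventionally derived as sketched below. The 1024-thread block matches the __launch_bounds__(1024) on the kernel; the local names and the grid layout are assumptions, not the file's code:

    const int64_t block_size_x = 1024; // one block quantizes 1024 padded values
    const int64_t block_num_x  = (kx0_padded + block_size_x - 1) / block_size_x;
    const dim3 num_blocks(block_num_x, kx1*channels, 1); // grid y walks rows x channels
    const dim3 block_size(block_size_x, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);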
llama/ggml-cuda/vecdotq.cuh

@@ -505,7 +505,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
     float sumf = 0.0f;

-#pragma unroll
+#pragma unroll QR6_K
     for (int i = 0; i < QR6_K; ++i) {
         const int sc = scales[4*i];
@@ -803,7 +803,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
     int   u[QR6_K];
     float d8[QR6_K];

-#pragma unroll
+#pragma unroll QR6_K
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
         d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
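The only change in this header is giving the two unroll pragmas an explicit trip count. nvcc's #pragma unroll accepts an integral constant expression (template parameters included), so #pragma unroll QR6_K requests full unrolling of exactly QR6_K iterations instead of leaving the decision to the compiler's heuristics. A hypothetical minimal illustration of the same idiom:

    template <int N>
    static __device__ __forceinline__ float dotn_demo(const float * a, const float * b) {
        float acc = 0.0f;
    #pragma unroll N   // fully unroll exactly N iterations
        for (int i = 0; i < N; ++i) {
            acc += a[i]*b[i];
        }
        return acc;
    }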