Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
orangecat
ollama
Commits
527cc978
Unverified
Commit
527cc978
authored
Dec 10, 2024
by
Jeffrey Morgan
Committed by
GitHub
Dec 10, 2024
Browse files
llama: update vendored code to commit 40c6d79f (#7875)
parent
a37f4a86
Changes
288
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
201 additions
and
766 deletions
+201
-766
llama/ggml-cuda/binbcast.cuh
llama/ggml-cuda/binbcast.cuh
+1
-1
llama/ggml-cuda/clamp.cu
llama/ggml-cuda/clamp.cu
+1
-1
llama/ggml-cuda/clamp.cuh
llama/ggml-cuda/clamp.cuh
+1
-1
llama/ggml-cuda/common.cuh
llama/ggml-cuda/common.cuh
+63
-40
llama/ggml-cuda/concat.cu
llama/ggml-cuda/concat.cu
+1
-1
llama/ggml-cuda/concat.cuh
llama/ggml-cuda/concat.cuh
+1
-1
llama/ggml-cuda/conv-transpose-1d.cu
llama/ggml-cuda/conv-transpose-1d.cu
+1
-1
llama/ggml-cuda/conv-transpose-1d.cuh
llama/ggml-cuda/conv-transpose-1d.cuh
+1
-1
llama/ggml-cuda/convert.cu
llama/ggml-cuda/convert.cu
+1
-1
llama/ggml-cuda/convert.cuh
llama/ggml-cuda/convert.cuh
+1
-1
llama/ggml-cuda/count-equal.cu
llama/ggml-cuda/count-equal.cu
+90
-0
llama/ggml-cuda/count-equal.cuh
llama/ggml-cuda/count-equal.cuh
+31
-0
llama/ggml-cuda/cpy.cu
llama/ggml-cuda/cpy.cu
+1
-1
llama/ggml-cuda/cpy.cuh
llama/ggml-cuda/cpy.cuh
+2
-2
llama/ggml-cuda/cross-entropy-loss.cu
llama/ggml-cuda/cross-entropy-loss.cu
+1
-1
llama/ggml-cuda/cross-entropy-loss.cuh
llama/ggml-cuda/cross-entropy-loss.cuh
+1
-1
llama/ggml-cuda/dequantize.cuh
llama/ggml-cuda/dequantize.cuh
+1
-1
llama/ggml-cuda/diagmask.cu
llama/ggml-cuda/diagmask.cu
+1
-1
llama/ggml-cuda/diagmask.cuh
llama/ggml-cuda/diagmask.cuh
+1
-1
llama/ggml-cuda/dmmv.cu
llama/ggml-cuda/dmmv.cu
+0
-709
No files found.
llama/ggml-cuda/binbcast.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/clamp.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/clamp.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/common.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
@@ -32,7 +32,7 @@
#include <cstdint>
#include <memory>
#if defined(GGML_USE_HIP
BLAS
)
#if defined(GGML_USE_HIP)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
...
...
@@ -52,13 +52,13 @@
#include <string>
#include <vector>
#if defined(GGML_USE_HIP
BLAS
)
#if defined(GGML_USE_HIP)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP
BLAS
)
#endif // defined(GGML_USE_HIP)
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
...
...
@@ -73,9 +73,20 @@
#define CC_TURING 750
#define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
// GCN/CNDA, wave size is 64
#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define CC_QY1 210
#define CC_QY2 220
...
...
@@ -123,7 +134,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#if !defined(GGML_USE_HIP
BLAS
)
#if !defined(GGML_USE_HIP)
static
const
char
*
cu_get_error_str
(
CUresult
err
)
{
const
char
*
err_str
;
cuGetErrorString
(
err
,
&
err_str
);
...
...
@@ -146,21 +157,21 @@ typedef float dfloat; // dequantize float
typedef
float2
dfloat2
;
#endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
#define FLASH_ATTN_AVAILABLE
...
...
@@ -182,14 +193,14 @@ static constexpr bool int8_mma_available(const int cc) {
static
__device__
void
no_device_code
(
const
char
*
file_name
,
const
int
line
,
const
char
*
function_name
,
const
int
arch
,
const
char
*
arch_list
)
{
#if defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
printf
(
"%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.
\n
"
,
file_name
,
line
,
function_name
,
arch
);
GGML_UNUSED
(
arch_list
);
#else
printf
(
"%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s
\n
"
,
file_name
,
line
,
function_name
,
arch
,
arch_list
);
#endif // defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
__trap
();
GGML_UNUSED
(
no_device_code
);
// suppress unused function warning
...
...
@@ -201,19 +212,31 @@ static __device__ void no_device_code(
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static
__device__
__forceinline__
int
warp_reduce_sum
(
int
x
)
{
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
return
__reduce_add_sync
(
0xffffffff
,
x
);
#else
#pragma unroll
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
x
+=
__shfl_xor_sync
(
0xffffffff
,
x
,
offset
,
32
);
}
return
x
;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
}
static
__device__
__forceinline__
float
warp_reduce_sum
(
float
x
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
{
x
+=
__shfl_xor_sync
(
0xffffffff
,
x
,
mask
,
32
);
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
x
+=
__shfl_xor_sync
(
0xffffffff
,
x
,
offset
,
32
);
}
return
x
;
}
static
__device__
__forceinline__
float2
warp_reduce_sum
(
float2
a
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
{
a
.
x
+=
__shfl_xor_sync
(
0xffffffff
,
a
.
x
,
mask
,
32
);
a
.
y
+=
__shfl_xor_sync
(
0xffffffff
,
a
.
y
,
mask
,
32
);
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
a
.
x
+=
__shfl_xor_sync
(
0xffffffff
,
a
.
x
,
offset
,
32
);
a
.
y
+=
__shfl_xor_sync
(
0xffffffff
,
a
.
y
,
offset
,
32
);
}
return
a
;
}
...
...
@@ -221,21 +244,21 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
static
__device__
__forceinline__
half2
warp_reduce_sum
(
half2
a
)
{
#ifdef FP16_AVAILABLE
#if defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
{
const
half2
a_other
=
__shfl_xor_sync
(
0xffffffff
,
a
,
mask
,
32
);
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
const
half2
a_other
=
__shfl_xor_sync
(
0xffffffff
,
a
,
offset
,
32
);
reinterpret_cast
<
half
&>
(
a
.
x
)
+=
__low2half
(
a_other
);
reinterpret_cast
<
half
&>
(
a
.
y
)
+=
__high2half
(
a_other
);
}
return
a
;
#else
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
{
a
=
__hadd2
(
a
,
__shfl_xor_sync
(
0xffffffff
,
a
,
mask
,
32
));
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
a
=
__hadd2
(
a
,
__shfl_xor_sync
(
0xffffffff
,
a
,
offset
,
32
));
}
return
a
;
#endif // defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#else
NO_DEVICE_CODE
;
...
...
@@ -245,8 +268,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
static
__device__
__forceinline__
float
warp_reduce_max
(
float
x
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
{
x
=
fmaxf
(
x
,
__shfl_xor_sync
(
0xffffffff
,
x
,
mask
,
32
));
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
x
=
fmaxf
(
x
,
__shfl_xor_sync
(
0xffffffff
,
x
,
offset
,
32
));
}
return
x
;
}
...
...
@@ -254,11 +277,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
static
__device__
__forceinline__
half
ggml_cuda_hmax
(
const
half
a
,
const
half
b
)
{
#ifdef FP16_AVAILABLE
#if !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
return
__float2half
(
fmaxf
(
__half2float
(
a
),
__half2float
(
b
)));
#else
return
__hmax
(
a
,
b
);
#endif // !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#else
NO_DEVICE_CODE
;
...
...
@@ -268,7 +291,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
}
static
__device__
__forceinline__
half2
ggml_cuda_hmax2
(
const
half2
a
,
const
half2
b
)
{
#if !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return
__hmax2
(
a
,
b
);
...
...
@@ -283,20 +306,20 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
GGML_UNUSED
(
a
);
GGML_UNUSED
(
b
);
NO_DEVICE_CODE
;
#endif // !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
}
static
__device__
__forceinline__
half2
warp_reduce_max
(
half2
x
)
{
#if !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
{
x
=
ggml_cuda_hmax2
(
x
,
__shfl_xor_sync
(
0xffffffff
,
x
,
mask
,
32
));
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
x
=
ggml_cuda_hmax2
(
x
,
__shfl_xor_sync
(
0xffffffff
,
x
,
offset
,
32
));
}
return
x
;
#else
GGML_UNUSED
(
x
);
NO_DEVICE_CODE
;
#endif // !(defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
...
...
@@ -308,7 +331,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
#endif // CUDART_VERSION < CUDART_HMASK
static
__device__
__forceinline__
int
ggml_cuda_dp4a
(
const
int
a
,
const
int
b
,
int
c
)
{
#if defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c
=
__builtin_amdgcn_sdot4
(
a
,
b
,
c
,
false
);
#elif defined(RDNA3)
...
...
@@ -334,7 +357,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#endif
return
c
;
#else // defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= MIN_CC_DP4A
return
__dp4a
(
a
,
b
,
c
);
...
...
@@ -344,7 +367,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
return
c
+
a8
[
0
]
*
b8
[
0
]
+
a8
[
1
]
*
b8
[
1
]
+
a8
[
2
]
*
b8
[
2
]
+
a8
[
3
]
*
b8
[
3
];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // defined(GGML_USE_HIP
BLAS
) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}
// TODO: move to ggml-common.h
...
...
llama/ggml-cuda/concat.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/concat.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/conv-transpose-1d.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/conv-transpose-1d.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/convert.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/convert.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/count-equal.cu
0 → 100644
View file @
527cc978
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#include "count-equal.cuh"
#include <cstdint>
template
<
typename
T
>
static
__global__
void
count_equal
(
const
T
*
__restrict__
x
,
const
T
*
__restrict__
y
,
int64_t
*
__restrict__
dst
,
const
int64_t
dk
,
const
int64_t
k
)
{
const
int64_t
i0
=
(
int64_t
)
blockIdx
.
x
*
dk
;
const
int64_t
i1
=
min
(
i0
+
dk
,
k
);
int
nequal
=
0
;
for
(
int64_t
i
=
i0
+
threadIdx
.
x
;
i
<
i1
;
i
+=
WARP_SIZE
)
{
const
T
xi
=
x
[
i
];
const
T
yi
=
y
[
i
];
nequal
+=
xi
==
yi
;
}
nequal
=
warp_reduce_sum
(
nequal
);
if
(
threadIdx
.
x
!=
0
)
{
return
;
}
atomicAdd
((
int
*
)
dst
,
nequal
);
}
void
ggml_cuda_count_equal
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
)
{
const
ggml_tensor
*
src0
=
dst
->
src
[
0
];
const
ggml_tensor
*
src1
=
dst
->
src
[
1
];
GGML_ASSERT
(
src0
->
type
==
src1
->
type
);
GGML_ASSERT
(
dst
->
type
==
GGML_TYPE_I64
);
GGML_ASSERT
(
ggml_are_same_shape
(
src0
,
src1
));
GGML_ASSERT
(
ggml_is_contiguous
(
src0
));
GGML_ASSERT
(
ggml_is_contiguous
(
src1
));
GGML_ASSERT
(
ggml_is_contiguous
(
dst
));
int64_t
*
dst_d
=
(
int64_t
*
)
dst
->
data
;
cudaStream_t
stream
=
ctx
.
stream
();
const
int
nsm
=
ggml_cuda_info
().
devices
[
ggml_cuda_get_device
()].
nsm
;
const
int64_t
ne
=
ggml_nelements
(
src0
);
GGML_ASSERT
(
ne
<
(
1
<<
30
)
&&
"atomicAdd implementation only supports int"
);
const
int64_t
dne
=
GGML_PAD
((
ne
+
4
*
nsm
-
1
)
/
(
4
*
nsm
),
CUDA_COUNT_EQUAL_CHUNK_SIZE
);
CUDA_CHECK
(
cudaMemsetAsync
(
dst_d
,
0
,
ggml_nbytes
(
dst
),
stream
));
const
dim3
blocks_dim
(
WARP_SIZE
,
1
,
1
);
const
dim3
blocks_num
(
std
::
min
((
int64_t
)
4
*
nsm
,
(
ne
+
CUDA_COUNT_EQUAL_CHUNK_SIZE
-
1
)
/
CUDA_COUNT_EQUAL_CHUNK_SIZE
),
1
,
1
);
switch
(
src0
->
type
)
{
case
GGML_TYPE_I32
:
{
const
int
*
src0_d
=
(
const
int
*
)
src0
->
data
;
const
int
*
src1_d
=
(
const
int
*
)
src1
->
data
;
count_equal
<<<
blocks_num
,
blocks_dim
,
0
,
stream
>>>
(
src0_d
,
src1_d
,
dst_d
,
dne
,
ne
);
}
break
;
default:
GGML_ASSERT
(
false
);
break
;
}
}
llama/ggml-cuda/count-equal.cuh
0 → 100644
View file @
527cc978
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
void
ggml_cuda_count_equal
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
);
llama/ggml-cuda/cpy.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/cpy.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
@@ -26,7 +26,7 @@
#include "common.cuh"
#define CUDA_CPY_BLOCK_SIZE
32
#define CUDA_CPY_BLOCK_SIZE
64
void
ggml_cuda_cpy
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
ggml_tensor
*
src1
);
...
...
llama/ggml-cuda/cross-entropy-loss.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/cross-entropy-loss.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/dequantize.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/diagmask.cu
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/diagmask.cuh
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
llama/ggml-cuda/dmmv.cu
deleted
100644 → 0
View file @
a37f4a86
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "dmmv.cuh"
#include "dequantize.cuh"
#include "convert.cuh"
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2
#else
static_assert
(
K_QUANTS_PER_ITERATION
==
1
||
K_QUANTS_PER_ITERATION
==
2
,
"K_QUANTS_PER_ITERATION must be 1 or 2"
);
#endif
static
__global__
void
dequantize_mul_mat_vec_q2_k
(
const
void
*
__restrict__
vx
,
const
float
*
__restrict__
yy
,
float
*
__restrict__
dst
,
const
int
ncols
,
int
nrows
)
{
static_assert
(
16
%
K_QUANTS_PER_ITERATION
==
0
,
"16 must be divisible by K_QUANTS_PER_ITERATION"
);
const
int
row
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
row
>
nrows
)
return
;
const
int
num_blocks_per_row
=
ncols
/
QK_K
;
const
int
ib0
=
row
*
num_blocks_per_row
;
const
block_q2_K
*
x
=
(
const
block_q2_K
*
)
vx
+
ib0
;
float
tmp
=
0
;
// partial sum for thread in warp
const
int
tid
=
threadIdx
.
x
/
K_QUANTS_PER_ITERATION
;
// 0...31 or 0...15
const
int
ix
=
threadIdx
.
x
%
K_QUANTS_PER_ITERATION
;
// 0 or 0,1
const
int
step
=
16
/
K_QUANTS_PER_ITERATION
;
const
int
im
=
tid
/
step
;
// 0 or 1. 0 computes 0..., 1 computes 128...
const
int
in
=
tid
-
step
*
im
;
// 0...15 or 0...7
const
int
l0
=
K_QUANTS_PER_ITERATION
*
in
;
// 0...15 or 0...14 in steps of 2
const
int
q_offset
=
32
*
im
+
l0
;
const
int
s_offset
=
8
*
im
;
const
int
y_offset
=
128
*
im
+
l0
;
uint32_t
aux
[
4
];
const
uint8_t
*
d
=
(
const
uint8_t
*
)
aux
;
const
uint8_t
*
m
=
(
const
uint8_t
*
)(
aux
+
2
);
for
(
int
i
=
ix
;
i
<
num_blocks_per_row
;
i
+=
K_QUANTS_PER_ITERATION
)
{
const
float
*
y
=
yy
+
i
*
QK_K
+
y_offset
;
const
uint8_t
*
q
=
x
[
i
].
qs
+
q_offset
;
const
float
dall
=
__low2half
(
x
[
i
].
dm
);
const
float
dmin
=
__high2half
(
x
[
i
].
dm
);
const
uint32_t
*
a
=
(
const
uint32_t
*
)(
x
[
i
].
scales
+
s_offset
);
aux
[
0
]
=
a
[
0
]
&
0x0f0f0f0f
;
aux
[
1
]
=
a
[
1
]
&
0x0f0f0f0f
;
aux
[
2
]
=
(
a
[
0
]
>>
4
)
&
0x0f0f0f0f
;
aux
[
3
]
=
(
a
[
1
]
>>
4
)
&
0x0f0f0f0f
;
float
sum1
=
0
,
sum2
=
0
;
for
(
int
l
=
0
;
l
<
K_QUANTS_PER_ITERATION
;
++
l
)
{
sum1
+=
y
[
l
+
0
]
*
d
[
0
]
*
((
q
[
l
+
0
]
>>
0
)
&
3
)
+
y
[
l
+
32
]
*
d
[
2
]
*
((
q
[
l
+
0
]
>>
2
)
&
3
)
+
y
[
l
+
64
]
*
d
[
4
]
*
((
q
[
l
+
0
]
>>
4
)
&
3
)
+
y
[
l
+
96
]
*
d
[
6
]
*
((
q
[
l
+
0
]
>>
6
)
&
3
)
+
y
[
l
+
16
]
*
d
[
1
]
*
((
q
[
l
+
16
]
>>
0
)
&
3
)
+
y
[
l
+
48
]
*
d
[
3
]
*
((
q
[
l
+
16
]
>>
2
)
&
3
)
+
y
[
l
+
80
]
*
d
[
5
]
*
((
q
[
l
+
16
]
>>
4
)
&
3
)
+
y
[
l
+
112
]
*
d
[
7
]
*
((
q
[
l
+
16
]
>>
6
)
&
3
);
sum2
+=
y
[
l
+
0
]
*
m
[
0
]
+
y
[
l
+
32
]
*
m
[
2
]
+
y
[
l
+
64
]
*
m
[
4
]
+
y
[
l
+
96
]
*
m
[
6
]
+
y
[
l
+
16
]
*
m
[
1
]
+
y
[
l
+
48
]
*
m
[
3
]
+
y
[
l
+
80
]
*
m
[
5
]
+
y
[
l
+
112
]
*
m
[
7
];
}
tmp
+=
dall
*
sum1
-
dmin
*
sum2
;
}
// sum up partial sums and write back result
tmp
=
warp_reduce_sum
(
tmp
);
if
(
threadIdx
.
x
==
0
)
{
dst
[
row
]
=
tmp
;
}
}
static
__global__
void
dequantize_mul_mat_vec_q3_k
(
const
void
*
__restrict__
vx
,
const
float
*
__restrict__
yy
,
float
*
__restrict__
dst
,
const
int
ncols
,
int
nrows
)
{
const
int
row
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
row
>
nrows
)
return
;
const
int
num_blocks_per_row
=
ncols
/
QK_K
;
const
int
ib0
=
row
*
num_blocks_per_row
;
const
block_q3_K
*
x
=
(
const
block_q3_K
*
)
vx
+
ib0
;
float
tmp
=
0
;
// partial sum for thread in warp
const
uint16_t
kmask1
=
0x0303
;
const
uint16_t
kmask2
=
0x0f0f
;
const
int
tid
=
threadIdx
.
x
/
K_QUANTS_PER_ITERATION
;
// 0...31 or 0...16
const
int
ix
=
threadIdx
.
x
%
K_QUANTS_PER_ITERATION
;
// 0 or 0,1
const
int
n
=
K_QUANTS_PER_ITERATION
;
// iterations in the inner loop
const
int
step
=
16
/
K_QUANTS_PER_ITERATION
;
const
int
im
=
tid
/
step
;
// 0 or 1. 0 computes 0..., 1 computes 128...
const
int
in
=
tid
-
step
*
im
;
// 0....15 or 0...7
const
uint8_t
m
=
1
<<
(
4
*
im
);
const
int
l0
=
n
*
in
;
// 0...15 or 0...14 in steps of 2
const
int
q_offset
=
32
*
im
+
l0
;
const
int
y_offset
=
128
*
im
+
l0
;
uint16_t
utmp
[
4
];
const
int8_t
*
s
=
(
const
int8_t
*
)
utmp
;
const
uint16_t
s_shift
=
4
*
im
;
for
(
int
i
=
ix
;
i
<
num_blocks_per_row
;
i
+=
K_QUANTS_PER_ITERATION
)
{
const
float
*
y
=
yy
+
i
*
QK_K
+
y_offset
;
const
uint8_t
*
q
=
x
[
i
].
qs
+
q_offset
;
const
uint8_t
*
h
=
x
[
i
].
hmask
+
l0
;
const
uint16_t
*
a
=
(
const
uint16_t
*
)
x
[
i
].
scales
;
utmp
[
0
]
=
((
a
[
0
]
>>
s_shift
)
&
kmask2
)
|
(((
a
[
4
]
>>
(
s_shift
+
0
))
&
kmask1
)
<<
4
);
utmp
[
1
]
=
((
a
[
1
]
>>
s_shift
)
&
kmask2
)
|
(((
a
[
5
]
>>
(
s_shift
+
0
))
&
kmask1
)
<<
4
);
utmp
[
2
]
=
((
a
[
2
]
>>
s_shift
)
&
kmask2
)
|
(((
a
[
4
]
>>
(
s_shift
+
2
))
&
kmask1
)
<<
4
);
utmp
[
3
]
=
((
a
[
3
]
>>
s_shift
)
&
kmask2
)
|
(((
a
[
5
]
>>
(
s_shift
+
2
))
&
kmask1
)
<<
4
);
const
float
d
=
x
[
i
].
d
;
float
sum
=
0
;
for
(
int
l
=
0
;
l
<
n
;
++
l
)
{
sum
+=
y
[
l
+
0
]
*
(
s
[
0
]
-
32
)
*
(((
q
[
l
]
>>
0
)
&
3
)
-
(
h
[
l
]
&
(
m
<<
0
)
?
0
:
4
))
+
y
[
l
+
32
]
*
(
s
[
2
]
-
32
)
*
(((
q
[
l
]
>>
2
)
&
3
)
-
(
h
[
l
]
&
(
m
<<
1
)
?
0
:
4
))
+
y
[
l
+
64
]
*
(
s
[
4
]
-
32
)
*
(((
q
[
l
]
>>
4
)
&
3
)
-
(
h
[
l
]
&
(
m
<<
2
)
?
0
:
4
))
+
y
[
l
+
96
]
*
(
s
[
6
]
-
32
)
*
(((
q
[
l
]
>>
6
)
&
3
)
-
(
h
[
l
]
&
(
m
<<
3
)
?
0
:
4
));
sum
+=
y
[
l
+
16
]
*
(
s
[
1
]
-
32
)
*
(((
q
[
l
+
16
]
>>
0
)
&
3
)
-
(
h
[
l
+
16
]
&
(
m
<<
0
)
?
0
:
4
))
+
y
[
l
+
48
]
*
(
s
[
3
]
-
32
)
*
(((
q
[
l
+
16
]
>>
2
)
&
3
)
-
(
h
[
l
+
16
]
&
(
m
<<
1
)
?
0
:
4
))
+
y
[
l
+
80
]
*
(
s
[
5
]
-
32
)
*
(((
q
[
l
+
16
]
>>
4
)
&
3
)
-
(
h
[
l
+
16
]
&
(
m
<<
2
)
?
0
:
4
))
+
y
[
l
+
112
]
*
(
s
[
7
]
-
32
)
*
(((
q
[
l
+
16
]
>>
6
)
&
3
)
-
(
h
[
l
+
16
]
&
(
m
<<
3
)
?
0
:
4
));
}
tmp
+=
d
*
sum
;
}
// sum up partial sums and write back result
tmp
=
warp_reduce_sum
(
tmp
);
if
(
threadIdx
.
x
==
0
)
{
dst
[
row
]
=
tmp
;
}
}
static
__global__
void
dequantize_mul_mat_vec_q4_k
(
const
void
*
__restrict__
vx
,
const
float
*
__restrict__
yy
,
float
*
__restrict__
dst
,
const
int
ncols
,
int
nrows
)
{
const
int
row
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
row
>
nrows
)
return
;
const
int
num_blocks_per_row
=
ncols
/
QK_K
;
const
int
ib0
=
row
*
num_blocks_per_row
;
const
block_q4_K
*
x
=
(
const
block_q4_K
*
)
vx
+
ib0
;
const
uint16_t
kmask1
=
0x3f3f
;
const
uint16_t
kmask2
=
0x0f0f
;
const
uint16_t
kmask3
=
0xc0c0
;
const
int
tid
=
threadIdx
.
x
/
K_QUANTS_PER_ITERATION
;
// 0...31 or 0...16
const
int
ix
=
threadIdx
.
x
%
K_QUANTS_PER_ITERATION
;
// 0 or 0,1
const
int
step
=
8
/
K_QUANTS_PER_ITERATION
;
// 8 or 4
const
int
il
=
tid
/
step
;
// 0...3
const
int
ir
=
tid
-
step
*
il
;
// 0...7 or 0...3
const
int
n
=
2
*
K_QUANTS_PER_ITERATION
;
// 2 or 4
const
int
im
=
il
/
2
;
// 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const
int
in
=
il
%
2
;
const
int
l0
=
n
*
(
2
*
ir
+
in
);
const
int
q_offset
=
32
*
im
+
l0
;
const
int
y_offset
=
64
*
im
+
l0
;
uint16_t
aux
[
4
];
const
uint8_t
*
sc
=
(
const
uint8_t
*
)
aux
;
#if K_QUANTS_PER_ITERATION == 2
uint32_t
q32
[
4
];
const
uint8_t
*
q4
=
(
const
uint8_t
*
)
q32
;
#else
uint16_t
q16
[
4
];
const
uint8_t
*
q4
=
(
const
uint8_t
*
)
q16
;
#endif
float
tmp
=
0
;
// partial sum for thread in warp
for
(
int
i
=
ix
;
i
<
num_blocks_per_row
;
i
+=
K_QUANTS_PER_ITERATION
)
{
const
float
*
y1
=
yy
+
i
*
QK_K
+
y_offset
;
const
float
*
y2
=
y1
+
128
;
const
float
dall
=
__low2half
(
x
[
i
].
dm
);
const
float
dmin
=
__high2half
(
x
[
i
].
dm
);
const
uint16_t
*
a
=
(
const
uint16_t
*
)
x
[
i
].
scales
;
aux
[
0
]
=
a
[
im
+
0
]
&
kmask1
;
aux
[
1
]
=
a
[
im
+
2
]
&
kmask1
;
aux
[
2
]
=
((
a
[
im
+
4
]
>>
0
)
&
kmask2
)
|
((
a
[
im
+
0
]
&
kmask3
)
>>
2
);
aux
[
3
]
=
((
a
[
im
+
4
]
>>
4
)
&
kmask2
)
|
((
a
[
im
+
2
]
&
kmask3
)
>>
2
);
#if K_QUANTS_PER_ITERATION == 2
const
uint32_t
*
q1
=
(
const
uint32_t
*
)(
x
[
i
].
qs
+
q_offset
);
const
uint32_t
*
q2
=
q1
+
16
;
q32
[
0
]
=
q1
[
0
]
&
0x0f0f0f0f
;
q32
[
1
]
=
q1
[
0
]
&
0xf0f0f0f0
;
q32
[
2
]
=
q2
[
0
]
&
0x0f0f0f0f
;
q32
[
3
]
=
q2
[
0
]
&
0xf0f0f0f0
;
float4
s
=
{
0.
f
,
0.
f
,
0.
f
,
0.
f
};
float
smin
=
0
;
for
(
int
l
=
0
;
l
<
4
;
++
l
)
{
s
.
x
+=
y1
[
l
]
*
q4
[
l
+
0
];
s
.
y
+=
y1
[
l
+
32
]
*
q4
[
l
+
4
];
s
.
z
+=
y2
[
l
]
*
q4
[
l
+
8
];
s
.
w
+=
y2
[
l
+
32
]
*
q4
[
l
+
12
];
smin
+=
y1
[
l
]
*
sc
[
2
]
+
y1
[
l
+
32
]
*
sc
[
3
]
+
y2
[
l
]
*
sc
[
6
]
+
y2
[
l
+
32
]
*
sc
[
7
];
}
tmp
+=
dall
*
(
s
.
x
*
sc
[
0
]
+
s
.
y
*
sc
[
1
]
*
1.
f
/
16.
f
+
s
.
z
*
sc
[
4
]
+
s
.
w
*
sc
[
5
]
*
1.
f
/
16.
f
)
-
dmin
*
smin
;
#else
const
uint16_t
*
q1
=
(
const
uint16_t
*
)(
x
[
i
].
qs
+
q_offset
);
const
uint16_t
*
q2
=
q1
+
32
;
q16
[
0
]
=
q1
[
0
]
&
0x0f0f
;
q16
[
1
]
=
q1
[
0
]
&
0xf0f0
;
q16
[
2
]
=
q2
[
0
]
&
0x0f0f
;
q16
[
3
]
=
q2
[
0
]
&
0xf0f0
;
float4
s
=
{
0.
f
,
0.
f
,
0.
f
,
0.
f
};
float
smin
=
0
;
for
(
int
l
=
0
;
l
<
2
;
++
l
)
{
s
.
x
+=
y1
[
l
]
*
q4
[
l
+
0
];
s
.
y
+=
y1
[
l
+
32
]
*
q4
[
l
+
2
];
s
.
z
+=
y2
[
l
]
*
q4
[
l
+
4
];
s
.
w
+=
y2
[
l
+
32
]
*
q4
[
l
+
6
];
smin
+=
y1
[
l
]
*
sc
[
2
]
+
y1
[
l
+
32
]
*
sc
[
3
]
+
y2
[
l
]
*
sc
[
6
]
+
y2
[
l
+
32
]
*
sc
[
7
];
}
tmp
+=
dall
*
(
s
.
x
*
sc
[
0
]
+
s
.
y
*
sc
[
1
]
*
1.
f
/
16.
f
+
s
.
z
*
sc
[
4
]
+
s
.
w
*
sc
[
5
]
*
1.
f
/
16.
f
)
-
dmin
*
smin
;
#endif
}
// sum up partial sums and write back result
tmp
=
warp_reduce_sum
(
tmp
);
if
(
tid
==
0
)
{
dst
[
row
]
=
tmp
;
}
}
static
__global__
void
dequantize_mul_mat_vec_q5_k
(
const
void
*
__restrict__
vx
,
const
float
*
__restrict__
yy
,
float
*
__restrict__
dst
,
const
int
ncols
)
{
const
int
row
=
blockIdx
.
x
;
const
int
num_blocks_per_row
=
ncols
/
QK_K
;
const
int
ib0
=
row
*
num_blocks_per_row
;
const
block_q5_K
*
x
=
(
const
block_q5_K
*
)
vx
+
ib0
;
float
tmp
=
0
;
// partial sum for thread in warp
const
uint16_t
kmask1
=
0x3f3f
;
const
uint16_t
kmask2
=
0x0f0f
;
const
uint16_t
kmask3
=
0xc0c0
;
const
int
tid
=
threadIdx
.
x
/
2
;
// 0...15
const
int
ix
=
threadIdx
.
x
%
2
;
const
int
il
=
tid
/
4
;
// 0...3
const
int
ir
=
tid
-
4
*
il
;
// 0...3
const
int
n
=
2
;
const
int
im
=
il
/
2
;
// 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const
int
in
=
il
%
2
;
const
int
l0
=
n
*
(
2
*
ir
+
in
);
const
int
q_offset
=
32
*
im
+
l0
;
const
int
y_offset
=
64
*
im
+
l0
;
const
uint8_t
hm1
=
1
<<
(
2
*
im
);
const
uint8_t
hm2
=
hm1
<<
4
;
uint16_t
aux
[
4
];
const
uint8_t
*
sc
=
(
const
uint8_t
*
)
aux
;
uint16_t
q16
[
8
];
const
uint8_t
*
q4
=
(
const
uint8_t
*
)
q16
;
for
(
int
i
=
ix
;
i
<
num_blocks_per_row
;
i
+=
2
)
{
const
uint8_t
*
ql1
=
x
[
i
].
qs
+
q_offset
;
const
uint8_t
*
qh
=
x
[
i
].
qh
+
l0
;
const
float
*
y1
=
yy
+
i
*
QK_K
+
y_offset
;
const
float
*
y2
=
y1
+
128
;
const
float
dall
=
__low2half
(
x
[
i
].
dm
);
const
float
dmin
=
__high2half
(
x
[
i
].
dm
);
const
uint16_t
*
a
=
(
const
uint16_t
*
)
x
[
i
].
scales
;
aux
[
0
]
=
a
[
im
+
0
]
&
kmask1
;
aux
[
1
]
=
a
[
im
+
2
]
&
kmask1
;
aux
[
2
]
=
((
a
[
im
+
4
]
>>
0
)
&
kmask2
)
|
((
a
[
im
+
0
]
&
kmask3
)
>>
2
);
aux
[
3
]
=
((
a
[
im
+
4
]
>>
4
)
&
kmask2
)
|
((
a
[
im
+
2
]
&
kmask3
)
>>
2
);
float4
sum
=
{
0.
f
,
0.
f
,
0.
f
,
0.
f
};
float
smin
=
0
;
const
uint16_t
*
q1
=
(
const
uint16_t
*
)
ql1
;
const
uint16_t
*
q2
=
q1
+
32
;
q16
[
0
]
=
q1
[
0
]
&
0x0f0f
;
q16
[
1
]
=
q1
[
8
]
&
0x0f0f
;
q16
[
2
]
=
(
q1
[
0
]
>>
4
)
&
0x0f0f
;
q16
[
3
]
=
(
q1
[
8
]
>>
4
)
&
0x0f0f
;
q16
[
4
]
=
q2
[
0
]
&
0x0f0f
;
q16
[
5
]
=
q2
[
8
]
&
0x0f0f
;
q16
[
6
]
=
(
q2
[
0
]
>>
4
)
&
0x0f0f
;
q16
[
7
]
=
(
q2
[
8
]
>>
4
)
&
0x0f0f
;
for
(
int
l
=
0
;
l
<
n
;
++
l
)
{
sum
.
x
+=
y1
[
l
+
0
]
*
(
q4
[
l
+
0
]
+
(
qh
[
l
+
0
]
&
(
hm1
<<
0
)
?
16
:
0
))
+
y1
[
l
+
16
]
*
(
q4
[
l
+
2
]
+
(
qh
[
l
+
16
]
&
(
hm1
<<
0
)
?
16
:
0
));
sum
.
y
+=
y1
[
l
+
32
]
*
(
q4
[
l
+
4
]
+
(
qh
[
l
+
0
]
&
(
hm1
<<
1
)
?
16
:
0
))
+
y1
[
l
+
48
]
*
(
q4
[
l
+
6
]
+
(
qh
[
l
+
16
]
&
(
hm1
<<
1
)
?
16
:
0
));
sum
.
z
+=
y2
[
l
+
0
]
*
(
q4
[
l
+
8
]
+
(
qh
[
l
+
0
]
&
(
hm2
<<
0
)
?
16
:
0
))
+
y2
[
l
+
16
]
*
(
q4
[
l
+
10
]
+
(
qh
[
l
+
16
]
&
(
hm2
<<
0
)
?
16
:
0
));
sum
.
w
+=
y2
[
l
+
32
]
*
(
q4
[
l
+
12
]
+
(
qh
[
l
+
0
]
&
(
hm2
<<
1
)
?
16
:
0
))
+
y2
[
l
+
48
]
*
(
q4
[
l
+
14
]
+
(
qh
[
l
+
16
]
&
(
hm2
<<
1
)
?
16
:
0
));
smin
+=
(
y1
[
l
]
+
y1
[
l
+
16
])
*
sc
[
2
]
+
(
y1
[
l
+
32
]
+
y1
[
l
+
48
])
*
sc
[
3
]
+
(
y2
[
l
]
+
y2
[
l
+
16
])
*
sc
[
6
]
+
(
y2
[
l
+
32
]
+
y2
[
l
+
48
])
*
sc
[
7
];
}
tmp
+=
dall
*
(
sum
.
x
*
sc
[
0
]
+
sum
.
y
*
sc
[
1
]
+
sum
.
z
*
sc
[
4
]
+
sum
.
w
*
sc
[
5
])
-
dmin
*
smin
;
}
// sum up partial sums and write back result
tmp
=
warp_reduce_sum
(
tmp
);
if
(
threadIdx
.
x
==
0
)
{
dst
[
row
]
=
tmp
;
}
}
static
__global__
void
dequantize_mul_mat_vec_q6_k
(
const
void
*
__restrict__
vx
,
const
float
*
__restrict__
yy
,
float
*
__restrict__
dst
,
const
int
ncols
,
int
nrows
)
{
static_assert
(
16
%
K_QUANTS_PER_ITERATION
==
0
,
"16 must be divisible by K_QUANTS_PER_ITERATION"
);
const
int
row
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
row
>
nrows
)
return
;
const
int
num_blocks_per_row
=
ncols
/
QK_K
;
const
int
ib0
=
row
*
num_blocks_per_row
;
const
block_q6_K
*
x
=
(
const
block_q6_K
*
)
vx
+
ib0
;
const
int
tid
=
threadIdx
.
x
/
K_QUANTS_PER_ITERATION
;
// 0...31 or 0...16
const
int
ix
=
threadIdx
.
x
%
K_QUANTS_PER_ITERATION
;
// 0 or 0, 1
const
int
step
=
16
/
K_QUANTS_PER_ITERATION
;
// 16 or 8
const
int
im
=
tid
/
step
;
// 0 or 1. 0 computes 0..., 1 computes 128...
const
int
in
=
tid
-
step
*
im
;
// 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const
int
l0
=
K_QUANTS_PER_ITERATION
*
in
;
// 0...15
const
int
is
=
0
;
#else
const
int
l0
=
4
*
in
;
// 0, 4, 8, ..., 28
const
int
is
=
in
/
4
;
#endif
const
int
ql_offset
=
64
*
im
+
l0
;
const
int
qh_offset
=
32
*
im
+
l0
;
const
int
s_offset
=
8
*
im
+
is
;
const
int
y_offset
=
128
*
im
+
l0
;
float
tmp
=
0
;
// partial sum for thread in warp
for
(
int
i
=
ix
;
i
<
num_blocks_per_row
;
i
+=
K_QUANTS_PER_ITERATION
)
{
const
float
*
y
=
yy
+
i
*
QK_K
+
y_offset
;
const
uint8_t
*
ql
=
x
[
i
].
ql
+
ql_offset
;
const
uint8_t
*
qh
=
x
[
i
].
qh
+
qh_offset
;
const
int8_t
*
s
=
x
[
i
].
scales
+
s_offset
;
const
float
d
=
x
[
i
].
d
;
#if K_QUANTS_PER_ITERATION == 1
float
sum
=
y
[
0
]
*
s
[
0
]
*
d
*
((
int8_t
)((
ql
[
0
]
&
0xF
)
|
((
qh
[
0
]
&
0x03
)
<<
4
))
-
32
)
+
y
[
16
]
*
s
[
1
]
*
d
*
((
int8_t
)((
ql
[
16
]
&
0xF
)
|
((
qh
[
16
]
&
0x03
)
<<
4
))
-
32
)
+
y
[
32
]
*
s
[
2
]
*
d
*
((
int8_t
)((
ql
[
32
]
&
0xF
)
|
((
qh
[
0
]
&
0x0c
)
<<
2
))
-
32
)
+
y
[
48
]
*
s
[
3
]
*
d
*
((
int8_t
)((
ql
[
48
]
&
0xF
)
|
((
qh
[
16
]
&
0x0c
)
<<
2
))
-
32
)
+
y
[
64
]
*
s
[
4
]
*
d
*
((
int8_t
)((
ql
[
0
]
>>
4
)
|
((
qh
[
0
]
&
0x30
)
>>
0
))
-
32
)
+
y
[
80
]
*
s
[
5
]
*
d
*
((
int8_t
)((
ql
[
16
]
>>
4
)
|
((
qh
[
16
]
&
0x30
)
>>
0
))
-
32
)
+
y
[
96
]
*
s
[
6
]
*
d
*
((
int8_t
)((
ql
[
32
]
>>
4
)
|
((
qh
[
0
]
&
0xc0
)
>>
2
))
-
32
)
+
y
[
112
]
*
s
[
7
]
*
d
*
((
int8_t
)((
ql
[
48
]
>>
4
)
|
((
qh
[
16
]
&
0xc0
)
>>
2
))
-
32
);
tmp
+=
sum
;
#else
float
sum
=
0
;
for
(
int
l
=
0
;
l
<
4
;
++
l
)
{
sum
+=
y
[
l
+
0
]
*
s
[
0
]
*
d
*
((
int8_t
)((
ql
[
l
+
0
]
&
0xF
)
|
(((
qh
[
l
]
>>
0
)
&
3
)
<<
4
))
-
32
)
+
y
[
l
+
32
]
*
s
[
2
]
*
d
*
((
int8_t
)((
ql
[
l
+
32
]
&
0xF
)
|
(((
qh
[
l
]
>>
2
)
&
3
)
<<
4
))
-
32
)
+
y
[
l
+
64
]
*
s
[
4
]
*
d
*
((
int8_t
)((
ql
[
l
+
0
]
>>
4
)
|
(((
qh
[
l
]
>>
4
)
&
3
)
<<
4
))
-
32
)
+
y
[
l
+
96
]
*
s
[
6
]
*
d
*
((
int8_t
)((
ql
[
l
+
32
]
>>
4
)
|
(((
qh
[
l
]
>>
6
)
&
3
)
<<
4
))
-
32
);
}
tmp
+=
sum
;
#endif
}
// sum up partial sums and write back result
tmp
=
warp_reduce_sum
(
tmp
);
if
(
tid
==
0
)
{
dst
[
row
]
=
tmp
;
}
}
static
__device__
void
convert_f16
(
const
void
*
vx
,
const
int64_t
ib
,
const
int
iqs
,
dfloat2
&
v
){
const
half
*
x
=
(
const
half
*
)
vx
;
// automatic half -> float type cast if dfloat == float
v
.
x
=
x
[
ib
+
iqs
+
0
];
v
.
y
=
x
[
ib
+
iqs
+
1
];
}
static
constexpr
__device__
dequantize_kernel_t
get_dequantize_kernel
(
ggml_type
type
)
{
return
type
==
GGML_TYPE_Q4_0
?
dequantize_q4_0
:
type
==
GGML_TYPE_Q4_1
?
dequantize_q4_1
:
type
==
GGML_TYPE_Q5_0
?
dequantize_q5_0
:
type
==
GGML_TYPE_Q5_1
?
dequantize_q5_1
:
type
==
GGML_TYPE_Q8_0
?
dequantize_q8_0
:
type
==
GGML_TYPE_F16
?
convert_f16
:
nullptr
;
}
template
<
ggml_type
type
>
static
__global__
void
dequantize_mul_mat_vec
(
const
void
*
__restrict__
vx
,
const
dfloat
*
__restrict__
y
,
float
*
__restrict__
dst
,
const
int
ncols
,
const
int
nrows
)
{
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
// quantized weights per x block
constexpr
int
qr
=
ggml_cuda_type_traits
<
type
>::
qr
;
// number of quantized weights per data value in x block
constexpr
dequantize_kernel_t
dequantize_kernel
=
get_dequantize_kernel
(
type
);
const
int64_t
row
=
(
int64_t
)
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
row
>=
nrows
)
{
return
;
}
const
int
tid
=
threadIdx
.
x
;
const
int
iter_stride
=
2
*
GGML_CUDA_DMMV_X
;
const
int
vals_per_iter
=
iter_stride
/
WARP_SIZE
;
// num quantized vals per thread and i iter
const
int
y_offset
=
qr
==
1
?
1
:
qk
/
2
;
// partial sum for each thread
#ifdef GGML_CUDA_F16
half2
tmp
=
{
0.0
f
,
0.0
f
};
// two sums for f16 to take advantage of half2 intrinsics
#else
float
tmp
=
0.0
f
;
#endif // GGML_CUDA_F16
for
(
int
i
=
0
;
i
<
ncols
;
i
+=
iter_stride
)
{
const
int
col
=
i
+
vals_per_iter
*
tid
;
const
int64_t
ib
=
((
int64_t
)
row
*
ncols
+
col
)
/
qk
;
// x block index
const
int
iqs
=
(
col
%
qk
)
/
qr
;
// x quant index
const
int
iybs
=
col
-
col
%
qk
;
// y block start index
// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
for
(
int
j
=
0
;
j
<
vals_per_iter
;
j
+=
2
)
{
// process 2 vals per j iter
// dequantize
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
dfloat2
v
;
dequantize_kernel
(
vx
,
ib
,
iqs
+
j
/
qr
,
v
);
// matrix multiplication
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
tmp
+=
__hmul2
(
v
,
{
y
[
iybs
+
iqs
+
j
/
qr
+
0
],
y
[
iybs
+
iqs
+
j
/
qr
+
y_offset
]
});
#else
tmp
+=
v
.
x
*
y
[
iybs
+
iqs
+
j
/
qr
+
0
];
tmp
+=
v
.
y
*
y
[
iybs
+
iqs
+
j
/
qr
+
y_offset
];
#endif // GGML_CUDA_F16
}
}
// sum up partial sums and write back result
tmp
=
warp_reduce_sum
(
tmp
);
if
(
tid
==
0
)
{
#ifdef GGML_CUDA_F16
dst
[
row
]
=
tmp
.
x
+
tmp
.
y
;
#else
dst
[
row
]
=
tmp
;
#endif // GGML_CUDA_F16
}
}
static
void
dequantize_mul_mat_vec_q4_0_cuda
(
const
void
*
vx
,
const
dfloat
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
(
GGML_CUDA_DMMV_X
*
2
)
==
0
);
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
dequantize_mul_mat_vec
<
GGML_TYPE_Q4_0
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q4_1_cuda
(
const
void
*
vx
,
const
dfloat
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
(
GGML_CUDA_DMMV_X
*
2
)
==
0
);
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
dequantize_mul_mat_vec
<
GGML_TYPE_Q4_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q5_0_cuda
(
const
void
*
vx
,
const
dfloat
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
(
GGML_CUDA_DMMV_X
*
2
)
==
0
);
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
dequantize_mul_mat_vec
<
GGML_TYPE_Q5_0
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q5_1_cuda
(
const
void
*
vx
,
const
dfloat
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
(
GGML_CUDA_DMMV_X
*
2
)
==
0
);
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
dequantize_mul_mat_vec
<
GGML_TYPE_Q5_1
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q8_0_cuda
(
const
void
*
vx
,
const
dfloat
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
(
GGML_CUDA_DMMV_X
*
2
)
==
0
);
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
dequantize_mul_mat_vec
<
GGML_TYPE_Q8_0
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q2_K_cuda
(
const
void
*
vx
,
const
float
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
QK_K
==
0
);
const
int
ny
=
2
;
// very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
const
int
block_num_y
=
(
nrows
+
ny
-
1
)
/
ny
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
32
,
ny
,
1
);
dequantize_mul_mat_vec_q2_k
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q3_K_cuda
(
const
void
*
vx
,
const
float
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
QK_K
==
0
);
const
int
ny
=
2
/
K_QUANTS_PER_ITERATION
;
const
int
block_num_y
=
(
nrows
+
ny
-
1
)
/
ny
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
32
,
ny
,
1
);
dequantize_mul_mat_vec_q3_k
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q4_K_cuda
(
const
void
*
vx
,
const
float
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
QK_K
==
0
);
const
int
ny
=
2
/
K_QUANTS_PER_ITERATION
;
const
int
block_num_y
=
(
nrows
+
ny
-
1
)
/
ny
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
32
,
ny
,
1
);
dequantize_mul_mat_vec_q4_k
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
dequantize_mul_mat_vec_q5_K_cuda
(
const
void
*
vx
,
const
float
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
QK_K
==
0
);
const
dim3
block_dims
(
32
,
1
,
1
);
dequantize_mul_mat_vec_q5_k
<<<
nrows
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
);
}
static
void
dequantize_mul_mat_vec_q6_K_cuda
(
const
void
*
vx
,
const
float
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
QK_K
==
0
);
const
int
ny
=
2
/
K_QUANTS_PER_ITERATION
;
const
int
block_num_y
=
(
nrows
+
ny
-
1
)
/
ny
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
32
,
ny
,
1
);
dequantize_mul_mat_vec_q6_k
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
static
void
convert_mul_mat_vec_f16_cuda
(
const
void
*
vx
,
const
dfloat
*
y
,
float
*
dst
,
const
int
ncols
,
const
int
nrows
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ncols
%
(
GGML_CUDA_DMMV_X
*
2
)
==
0
);
const
int
block_num_y
=
(
nrows
+
GGML_CUDA_MMV_Y
-
1
)
/
GGML_CUDA_MMV_Y
;
const
dim3
block_nums
(
block_num_y
,
1
,
1
);
const
dim3
block_dims
(
WARP_SIZE
,
GGML_CUDA_MMV_Y
,
1
);
dequantize_mul_mat_vec
<
GGML_TYPE_F16
>
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
vx
,
y
,
dst
,
ncols
,
nrows
);
}
void
ggml_cuda_op_dequantize_mul_mat_vec
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
,
const
char
*
src0_dd_i
,
const
float
*
src1_ddf_i
,
const
char
*
src1_ddq_i
,
float
*
dst_dd_i
,
const
int64_t
row_low
,
const
int64_t
row_high
,
const
int64_t
src1_ncols
,
const
int64_t
src1_padded_row_size
,
cudaStream_t
stream
)
{
GGML_UNUSED
(
ctx
);
const
int64_t
ne00
=
src0
->
ne
[
0
];
const
int64_t
row_diff
=
row_high
-
row_low
;
GGML_ASSERT
(
src1
->
type
==
GGML_TYPE_F32
);
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_F16
ggml_cuda_pool_alloc
<
half
>
src1_dfloat_a
(
ctx
.
pool
());
half
*
src1_dfloat
=
nullptr
;
// dfloat == half
bool
src1_convert_f16
=
src0
->
type
==
GGML_TYPE_Q4_0
||
src0
->
type
==
GGML_TYPE_Q4_1
||
src0
->
type
==
GGML_TYPE_Q5_0
||
src0
->
type
==
GGML_TYPE_Q5_1
||
src0
->
type
==
GGML_TYPE_Q8_0
||
src0
->
type
==
GGML_TYPE_F16
;
if
(
src1_convert_f16
)
{
src1_dfloat
=
src1_dfloat_a
.
alloc
(
ne00
);
const
to_fp16_cuda_t
to_fp16_cuda
=
ggml_get_to_fp16_cuda
(
src1
->
type
);
GGML_ASSERT
(
to_fp16_cuda
!=
nullptr
);
to_fp16_cuda
(
src1_ddf_i
,
src1_dfloat
,
ne00
,
stream
);
}
#else
const
dfloat
*
src1_dfloat
=
(
const
dfloat
*
)
src1_ddf_i
;
// dfloat == float, no conversion
#endif // GGML_CUDA_F16
switch
(
src0
->
type
)
{
case
GGML_TYPE_Q4_0
:
dequantize_mul_mat_vec_q4_0_cuda
(
src0_dd_i
,
src1_dfloat
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q4_1
:
dequantize_mul_mat_vec_q4_1_cuda
(
src0_dd_i
,
src1_dfloat
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q5_0
:
dequantize_mul_mat_vec_q5_0_cuda
(
src0_dd_i
,
src1_dfloat
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q5_1
:
dequantize_mul_mat_vec_q5_1_cuda
(
src0_dd_i
,
src1_dfloat
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q8_0
:
dequantize_mul_mat_vec_q8_0_cuda
(
src0_dd_i
,
src1_dfloat
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q2_K
:
dequantize_mul_mat_vec_q2_K_cuda
(
src0_dd_i
,
src1_ddf_i
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q3_K
:
dequantize_mul_mat_vec_q3_K_cuda
(
src0_dd_i
,
src1_ddf_i
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q4_K
:
dequantize_mul_mat_vec_q4_K_cuda
(
src0_dd_i
,
src1_ddf_i
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q5_K
:
dequantize_mul_mat_vec_q5_K_cuda
(
src0_dd_i
,
src1_ddf_i
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_Q6_K
:
dequantize_mul_mat_vec_q6_K_cuda
(
src0_dd_i
,
src1_ddf_i
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
case
GGML_TYPE_F16
:
convert_mul_mat_vec_f16_cuda
(
src0_dd_i
,
src1_dfloat
,
dst_dd_i
,
ne00
,
row_diff
,
stream
);
break
;
default:
GGML_ABORT
(
"fatal error"
);
break
;
}
GGML_UNUSED
(
src1
);
GGML_UNUSED
(
dst
);
GGML_UNUSED
(
src1_ddq_i
);
GGML_UNUSED
(
src1_ncols
);
GGML_UNUSED
(
src1_padded_row_size
);
}
bool
ggml_cuda_dmmv_type_supported
(
ggml_type
src0_type
)
{
return
src0_type
==
GGML_TYPE_Q4_0
||
src0_type
==
GGML_TYPE_Q4_1
||
src0_type
==
GGML_TYPE_Q5_0
||
src0_type
==
GGML_TYPE_Q5_1
||
src0_type
==
GGML_TYPE_Q8_0
||
src0_type
==
GGML_TYPE_Q2_K
||
src0_type
==
GGML_TYPE_Q3_K
||
src0_type
==
GGML_TYPE_Q4_K
||
src0_type
==
GGML_TYPE_Q5_K
||
src0_type
==
GGML_TYPE_Q6_K
||
src0_type
==
GGML_TYPE_F16
;
}
Prev
1
2
3
4
5
6
7
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment