OpenDAS / ollama / Commits / 7a81daf0

Commit 7a81daf0 (Unverified)
Authored Dec 14, 2024 by Jeffrey Morgan; committed by GitHub on Dec 14, 2024
llama: update vendor code to commit ba1cb19c (#8101)
parent 60f75560

Changes: 273
Showing 20 changed files with 97 additions and 72 deletions (+97 −72)
llama/ggml-cuda/clamp.cuh                 +1  -1
llama/ggml-cuda/common.cuh                +36 -36
llama/ggml-cuda/concat.cu                 +40 -15
llama/ggml-cuda/concat.cuh                +1  -1
llama/ggml-cuda/conv-transpose-1d.cu      +1  -1
llama/ggml-cuda/conv-transpose-1d.cuh     +1  -1
llama/ggml-cuda/convert.cu                +4  -4
llama/ggml-cuda/convert.cuh               +1  -1
llama/ggml-cuda/count-equal.cu            +1  -1
llama/ggml-cuda/count-equal.cuh           +1  -1
llama/ggml-cuda/cpy.cu                    +1  -1
llama/ggml-cuda/cpy.cuh                   +1  -1
llama/ggml-cuda/cross-entropy-loss.cu     +1  -1
llama/ggml-cuda/cross-entropy-loss.cuh    +1  -1
llama/ggml-cuda/dequantize.cuh            +1  -1
llama/ggml-cuda/diagmask.cu               +1  -1
llama/ggml-cuda/diagmask.cuh              +1  -1
llama/ggml-cuda/fattn-common.cuh          +1  -1
llama/ggml-cuda/fattn-tile-f16.cu         +1  -1
llama/ggml-cuda/fattn-tile-f16.cuh        +1  -1
llama/ggml-cuda/clamp.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/common.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
@@ -67,28 +67,28 @@
 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define CC_PASCAL     600
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_TURING     750
-#define CC_AMPERE     800
-#define CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_PASCAL     600
+#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA      700
+#define GGML_CUDA_CC_TURING     750
+#define GGML_CUDA_CC_AMPERE     800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000

 // GCN/CNDA, wave size is 64
-#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
-#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
-#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
-#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
-#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
-#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 942)  // MI300

 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

-#define CC_QY1        210
-#define CC_QY2        220
+#define GGML_CUDA_CC_QY1        210
+#define GGML_CUDA_CC_QY2        220

 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
...
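For orientation (an illustration of mine, not part of the diff): the rename only adds a GGML_CUDA_ prefix, and the numeric scheme is unchanged. AMD targets are still encoded as GGML_CUDA_CC_OFFSET_AMD plus an architecture number, so a single comparison against the offset separates vendors, which is what predicates such as fp16_mma_available further down rely on. A self-contained sketch using the values shown above; the cc_is_amd/cc_is_nvidia helpers are hypothetical and do not appear in this commit:

    // Illustration only: the vendor split implied by the offset encoding above.
    #define GGML_CUDA_CC_OFFSET_AMD 1000000
    #define GGML_CUDA_CC_VOLTA      700
    #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030)

    static constexpr bool cc_is_amd   (int cc) { return cc >= GGML_CUDA_CC_OFFSET_AMD; }
    static constexpr bool cc_is_nvidia(int cc) { return cc <  GGML_CUDA_CC_OFFSET_AMD; }

    static_assert(cc_is_nvidia(GGML_CUDA_CC_VOLTA), "700 is an NVIDIA compute capability");
    static_assert(cc_is_amd   (GGML_CUDA_CC_RDNA2), "RDNA2 is encoded above the AMD offset");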
@@ -157,36 +157,36 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

 static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }

 static constexpr bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }

 static constexpr bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }

 [[noreturn]]
...
@@ -213,7 +213,7 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__

 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
...
@@ -221,7 +221,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
...
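As background for the two warp_reduce_sum hunks (a sketch of mine, not from the diff): on Ampere and newer the sum comes from the __reduce_add_sync intrinsic, while older architectures fall back to the butterfly shuffle shown above, where each of the five iterations folds partial sums across lanes until every lane of the 32-thread warp holds the total. A minimal standalone version of that fallback:

    // Standalone illustration of the shuffle-based fallback used when
    // __reduce_add_sync (Ampere and newer) is unavailable.
    __device__ __forceinline__ int warp_reduce_sum_fallback(int x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            // Add the partial sum held by the lane 'offset' positions away.
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);
        }
        return x; // every lane now holds the warp-wide sum
    }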
@@ -310,7 +310,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 }

 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
...
@@ -319,7 +319,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 #if CUDART_VERSION < CUDART_HMASK
...
@@ -359,13 +359,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

-#if __CUDA_ARCH__ >= MIN_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
...
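To make the renamed guard concrete (my example, not from the diff): __dp4a treats each 32-bit operand as four signed bytes, multiplies them pairwise, and adds the four products to the accumulator, which is what the scalar branch above computes by hand. A small host-side check of that equivalence, assuming a little-endian layout:

    // Host-side illustration of the byte-wise dot product that __dp4a and the
    // scalar fallback above both compute.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static int dp4a_scalar(int a, int b, int c) {
        int8_t a8[4], b8[4];
        std::memcpy(a8, &a, 4);
        std::memcpy(b8, &b, 4);
        return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
    }

    int main() {
        const int a = 0x01020304; // bytes 4, 3, 2, 1 on a little-endian machine
        const int b = 0x01010101; // bytes 1, 1, 1, 1
        assert(dp4a_scalar(a, b, 10) == 10 + 4 + 3 + 2 + 1);
        return 0;
    }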
llama/ggml-cuda/concat.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
@@ -120,7 +120,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
 }

 // non-contiguous kernel (slow)
-static __global__ void concat_f32_non_cont(
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
+    concat_f32_non_cont(
         const char * src0,
         const char * src1,
               char * dst,
...
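One note on the qualifier added here (my gloss, not in the diff): __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) promises the compiler that the kernel is never launched with more threads per block than that constant, which lets it budget registers accordingly. A minimal sketch of the pattern with a placeholder block size:

    // Illustration of __launch_bounds__ on a templated kernel; BLOCK_SIZE is a
    // stand-in for CUDA_CONCAT_BLOCK_SIZE.
    constexpr int BLOCK_SIZE = 256;

    template <int dim>
    static __global__ void __launch_bounds__(BLOCK_SIZE) example_kernel(float * out) {
        // The compiler may assume blockDim.x * blockDim.y * blockDim.z <= BLOCK_SIZE.
        out[blockIdx.x * blockDim.x + threadIdx.x] = static_cast<float>(dim);
    }

    // Launch with at most BLOCK_SIZE threads per block, e.g.:
    //     example_kernel<0><<<grid, BLOCK_SIZE, 0, stream>>>(out);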
@@ -147,22 +149,28 @@ static __global__ void concat_f32_non_cont(
         uint64_t nb0,
         uint64_t nb1,
         uint64_t nb2,
-        uint64_t nb3,
-        int32_t dim) {
+        uint64_t nb3){
+    static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
+
     const int64_t i3 = blockIdx.z;
     const int64_t i2 = blockIdx.y;
     const int64_t i1 = blockIdx.x;

-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
-
     const float * x;

-    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
         if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
             x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
         } else {
-            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+            if constexpr (dim == 0) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+            } else if constexpr (dim == 1) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+            } else if constexpr (dim == 2) {
+                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+            } else if constexpr (dim == 3) {
+                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+            }
         }

         float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
...
@@ -208,15 +216,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     } else {
         dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
-        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
-            (const char *) src0->data,
-            (const char *) src1->data,
-            (      char *) dst->data,
+        auto launch_kernel = [&](auto dim) {
+            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                 src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                 src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                 src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
-            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-            dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+        };
+        switch (dim) {
+            case 0:
+                launch_kernel(std::integral_constant<int, 0>{});
+                break;
+            case 1:
+                launch_kernel(std::integral_constant<int, 1>{});
+                break;
+            case 2:
+                launch_kernel(std::integral_constant<int, 2>{});
+                break;
+            case 3:
+                launch_kernel(std::integral_constant<int, 3>{});
+                break;
+            default:
+                GGML_ABORT("Invalid dim: %d", dim);
+                break;
+        }
     }
 }
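For readers unfamiliar with the dispatch pattern above (a stripped-down sketch of mine, not part of the commit): the switch turns the runtime dim into a compile-time constant by handing std::integral_constant<int, N>{} to a generic lambda, whose parameter type then supplies the value needed to instantiate the kernel template. A host-only version of the same idea:

    // Host-only illustration of routing a runtime value to a template parameter
    // via std::integral_constant and a generic lambda.
    #include <cstdio>
    #include <type_traits>

    template <int dim>
    static void concat_impl() {
        std::printf("instantiated for dim = %d\n", dim);
    }

    static void concat_dispatch(int dim) {
        auto launch = [&](auto dim_c) {
            // dim_c is std::integral_constant<int, N>; its ::value is a constant expression.
            concat_impl<decltype(dim_c)::value>();
        };
        switch (dim) {
            case 0: launch(std::integral_constant<int, 0>{}); break;
            case 1: launch(std::integral_constant<int, 1>{}); break;
            case 2: launch(std::integral_constant<int, 2>{}); break;
            case 3: launch(std::integral_constant<int, 3>{}); break;
            default: std::printf("invalid dim: %d\n", dim);   break;
        }
    }

    int main() {
        concat_dispatch(2); // prints: instantiated for dim = 2
        return 0;
    }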
llama/ggml-cuda/concat.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/conv-transpose-1d.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/conv-transpose-1d.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/convert.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
@@ -52,7 +52,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 template <bool need_check>
 static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
-#if __CUDA_ARCH__ >= CC_PASCAL
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
     constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

     const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
...
@@ -90,7 +90,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     GGML_UNUSED(y);
     GGML_UNUSED(k);
     NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_PASCAL
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 template <typename dst_t>
...
@@ -625,7 +625,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
+            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
                 return dequantize_block_q8_0_f16_cuda;
             }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
...
llama/ggml-cuda/convert.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/count-equal.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/count-equal.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/cpy.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/cpy.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/cross-entropy-loss.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/cross-entropy-loss.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/dequantize.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/diagmask.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/diagmask.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/fattn-common.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/fattn-tile-f16.cu  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/fattn-tile-f16.cuh  (view file @ 7a81daf0)

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...