orangecat / ollama · Commit 527cc978

llama: update vendored code to commit 40c6d79f (#7875)

Unverified commit, authored Dec 10, 2024 by Jeffrey Morgan; committed by GitHub on Dec 10, 2024.
Parent: a37f4a86

Changes: 288 · Showing 20 changed files with 62 additions and 64 deletions (+62 -64)
llama/ggml-cuda/mmvq.cu             +6  -6
llama/ggml-cuda/mmvq.cuh            +1  -1
llama/ggml-cuda/norm.cu             +1  -1
llama/ggml-cuda/norm.cuh            +1  -1
llama/ggml-cuda/opt-step-adamw.cu   +34 -36
llama/ggml-cuda/opt-step-adamw.cuh  +1  -1
llama/ggml-cuda/out-prod.cu         +1  -1
llama/ggml-cuda/out-prod.cuh        +1  -1
llama/ggml-cuda/pad.cu              +1  -1
llama/ggml-cuda/pad.cuh             +1  -1
llama/ggml-cuda/pool2d.cu           +1  -1
llama/ggml-cuda/pool2d.cuh          +1  -1
llama/ggml-cuda/quantize.cu         +5  -5
llama/ggml-cuda/quantize.cuh        +1  -1
llama/ggml-cuda/rope.cu             +1  -1
llama/ggml-cuda/rope.cuh            +1  -1
llama/ggml-cuda/scale.cu            +1  -1
llama/ggml-cuda/scale.cuh           +1  -1
llama/ggml-cuda/softmax.cu          +1  -1
llama/ggml-cuda/softmax.cuh         +1  -1
llama/ggml-cuda/mmvq.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...

@@ -74,10 +74,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
 }

 template <ggml_type type, int ncols_y>
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 // tell the compiler to use as many registers as it wants, see nwarps definition below
 __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
...

@@ -88,13 +88,13 @@ static __global__ void mul_mat_vec_q(
     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
     constexpr int nwarps              = 1;
     constexpr int rows_per_cuda_block = 1;
 #else
     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)

     const int tid  = WARP_SIZE*threadIdx.y + threadIdx.x;
     const int row0 = rows_per_cuda_block*blockIdx.x;
...

@@ -168,7 +168,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;

-    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch (ncols_y) {
             case 1:
                 nwarps = 4;
...
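The three hunks above are two mechanical renames plus one dispatch change: the HIP build guard GGML_USE_HIPBLAS becomes GGML_USE_HIP, and the host-side launch heuristic now keys on CC_CDNA / CC_RDNA1 instead of CC_RDNA2. For readers unfamiliar with the __launch_bounds__ qualifier inside that guard, here is a minimal compilable sketch of the same pattern; demo_kernel and the WARP_SIZE value are illustrative assumptions, not part of the diff:

    // Sketch only: mirrors the guarded __launch_bounds__ usage in mmvq.cu above.
    // WARP_SIZE = 32 is assumed here (NVIDIA warps; AMD wavefronts are often 64).
    #define WARP_SIZE 32

    template <int ncols_y>
    #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
    // First argument caps threads per block so the compiler can budget registers
    // per thread; the second asks for at least one resident block per SM.
    __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
    #endif
    static __global__ void demo_kernel(float * dst) {
        dst[blockIdx.x*blockDim.x + threadIdx.x] = 0.0f;
    }

As in the diff, the qualifier is skipped entirely on the AMD HIP path, letting the compiler there use as many registers as it wants.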
llama/ggml-cuda/mmvq.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/norm.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/norm.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/opt-step-adamw.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...

@@ -24,14 +24,14 @@
  * SOFTWARE.
  */

+#include "ggml-impl.h"
 #include "opt-step-adamw.cuh"

 #include <cstdint>

 static __global__ void opt_step_adamw_f32(
-    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
-    const float beta1h, const float beta2h) {
+    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v,
+    const float * __restrict__ pars, const int64_t k) {

     const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
...

@@ -39,6 +39,14 @@ static __global__ void opt_step_adamw_f32(
         return;
     }

+    const float alpha  = pars[0];
+    const float beta1  = pars[1];
+    const float beta2  = pars[2];
+    const float eps    = pars[3];
+    const float wd     = pars[4];
+    const float beta1h = pars[5];
+    const float beta2h = pars[6];
+
     const float gi  = g[i];
     const float gmi = g_m[i]*beta1 +    gi*(1.0f - beta1);
     const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
...

@@ -49,58 +57,48 @@ static __global__ void opt_step_adamw_f32(
     const float mh = gmi*beta1h;
     const float vh = sqrtf(gvi*beta2h) + eps;

-    x[i] = x[i]*(1.0f - alpha*wd) - mh/vh;
+    x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh;
 }

 static void opt_step_adamw_f32_cuda(
-    float * x, const float * g, float * g_m, float * g_v, const int64_t k,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
-    const float beta1h, const float beta2h, cudaStream_t stream) {
+    float * x, const float * g, float * g_m, float * g_v, const float * pars, const int64_t k, cudaStream_t stream) {

     const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
     const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1)/CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
-    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h);
+    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, pars, k);
 }

 void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0        = dst->src[0];
-    const ggml_tensor * src0_grad   = dst->src[1];
-    const ggml_tensor * src0_grad_m = dst->src[2];
-    const ggml_tensor * src0_grad_v = dst->src[3];
-
-    GGML_ASSERT(src0->type        == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad->type   == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32);
+    const ggml_tensor * src0         = dst->src[0];
+    const ggml_tensor * src0_grad    = dst->src[1];
+    const ggml_tensor * src0_grad_m  = dst->src[2];
+    const ggml_tensor * src0_grad_v  = dst->src[3];
+    const ggml_tensor * adamw_params = dst->src[4];
+
+    GGML_ASSERT(src0->type         == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad->type    == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_m->type  == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_v->type  == GGML_TYPE_F32);
+    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(src0_grad));
     GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
     GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
+    GGML_ASSERT(ggml_is_contiguous(adamw_params));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);

-    float       * src0_d        = (float       *) src0->data;
-    const float * src0_grad_d   = (const float *) src0_grad->data;
-    float       * src0_grad_m_d = (float       *) src0_grad_m->data;
-    float       * src0_grad_v_d = (float       *) src0_grad_v->data;
+    float       * src0_d         = (float       *) src0->data;
+    const float * src0_grad_d    = (const float *) src0_grad->data;
+    float       * src0_grad_m_d  = (float       *) src0_grad_m->data;
+    float       * src0_grad_v_d  = (float       *) src0_grad_v->data;
+    const float * adamw_params_d = (const float *) adamw_params->data;

     cudaStream_t stream = ctx.stream();

     const int64_t ne = ggml_nelements(src0);

-    int64_t iter; memcpy(&iter,  &dst->op_params[0], sizeof(int64_t));
-    float alpha;  memcpy(&alpha, &dst->op_params[2], sizeof(float));
-    float beta1;  memcpy(&beta1, &dst->op_params[3], sizeof(float));
-    float beta2;  memcpy(&beta2, &dst->op_params[4], sizeof(float));
-    float eps;    memcpy(&eps,   &dst->op_params[5], sizeof(float));
-    float wd;     memcpy(&wd,    &dst->op_params[6], sizeof(float));
-
-    const float beta1h = alpha/(1.0f - powf(beta1, iter));
-    const float beta2h =  1.0f/(1.0f - powf(beta2, iter));
-
-    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream);
-
-    iter++;
-    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
+    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, adamw_params_d, ne, stream);
 }
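Functionally, this hunk set moves the AdamW hyperparameters out of the kernel argument list and dst->op_params into a small device tensor (dst->src[4], seven floats) read by the kernel as pars[0..6]; the host-side iteration bookkeeping (the op_params memcpy round-trip and iter++) disappears with it. For reference, the update both versions implement is the standard AdamW step; a LaTeX sketch assembled from the kernel math above, assuming beta1h and beta2h are the usual bias-correction factors 1/(1 - beta^t) (consistent with the new kernel multiplying mh/vh by alpha explicitly, where the deleted host code folded alpha into beta1h):

    m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
    \hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}
    x_t = x_{t-1}\,(1 - \alpha\, wd) - \alpha\, \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}

In the kernel, gmi and gvi are m_t and v_t, mh = gmi*beta1h and vh = sqrtf(gvi*beta2h) + eps form the final quotient, and the (1.0f - alpha*wd) factor is the decoupled weight decay.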
llama/ggml-cuda/opt-step-adamw.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/out-prod.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/out-prod.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/pad.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/pad.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/pool2d.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/pool2d.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/quantize.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...

@@ -95,8 +95,8 @@ static __global__ void quantize_mmq_q8_1(
     // Exchange max. abs. value between vals_per_scale/4 threads.
 #pragma unroll
-    for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
+    for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
     }

     float sum;
...

@@ -105,8 +105,8 @@ static __global__ void quantize_mmq_q8_1(
         // Exchange calculate sum across vals_per_sum/4 threads.
 #pragma unroll
-        for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) {
-            sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE);
+        for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
+            sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
         }
     }
...
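In both hunks the change is only a rename of the loop variable from mask to offset; the reduction itself is unchanged. The loop is a standard warp-level butterfly reduction: each iteration exchanges values between lanes whose indices differ by one bit, halving the stride until every lane holds the combined result. A self-contained sketch of the same pattern, assuming a full 32-lane warp (warp_max and demo are illustrative helpers, not part of the diff):

    #define WARP_SIZE 32

    // After log2(WARP_SIZE) XOR exchanges, every lane holds the warp-wide maximum.
    __device__ float warp_max(float v) {
        for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
            v = fmaxf(v, __shfl_xor_sync(0xFFFFFFFF, v, offset, WARP_SIZE));
        }
        return v;
    }

    __global__ void demo(const float * x, float * out) {
        const float m = warp_max(x[threadIdx.x]);
        if (threadIdx.x == 0) {
            *out = m; // lane 0 publishes the reduced value
        }
    }

The rename matters for readability: the third argument of __shfl_xor_sync is a lane-index XOR distance, not a participation mask (that is the first argument, 0xFFFFFFFF here).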
llama/ggml-cuda/quantize.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/rope.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/rope.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/scale.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/scale.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/softmax.cu

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
llama/ggml-cuda/softmax.cuh

/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...