OpenDAS / ollama

Commit 23a7a73f, authored Feb 28, 2025 by xuxzh1
parent f38c5ca1

    fix bug

Changes: 10 changed files, with 10 additions and 10 deletions (+10 -10)
llama/ggml-cuda/binbcast.cu   +1 -1
llama/ggml-cuda/convert.cu    +1 -1
llama/ggml-cuda/cpy.cu        +1 -1
llama/ggml-cuda/ggml-cuda.cu  +1 -1
llama/ggml-cuda/mmvq.cu       +1 -1
llama/ggml-cuda/quantize.cu   +1 -1
llama/ggml-cuda/rope.cu       +1 -1
llama/ggml-cuda/softmax.cu    +1 -1
llama/ggml-cuda/unary.cu      +1 -1
make/Makefile.rocm            +1 -1
llama/ggml-cuda/binbcast.cu

@@ -49,7 +49,7 @@ static __device__ __forceinline__ float op_div(const float a, const float b) {
 }
 
 template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+static __global__ __launch_bounds__(1024) void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,
         int ne10, int ne11, int ne12, int ne13,
         /*int s0, */ int s1, int s2, int s3,
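Each of the CUDA files below receives the same one-line change: a __launch_bounds__(1024) annotation inserted into a kernel declaration. As a minimal self-contained sketch of what that annotation means (not taken from this repository; scale_f32 is a hypothetical kernel), it promises the compiler that the kernel is never launched with more than 1024 threads per block, so the register allocator can budget against that cap, and a launch exceeding it fails:

// Minimal standalone sketch (not from this commit): a kernel annotated the
// same way as the kernels in this change. scale_f32 is a hypothetical name.
#include <cstdio>

// __launch_bounds__(1024) declares that launches use at most 1024 threads
// per block; launching with a larger block would fail at launch time.
static __global__ __launch_bounds__(1024) void scale_f32(const float * x, float * dst, const float s, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * s;
}

int main() {
    const int k = 1 << 20;
    float * x; float * dst;
    cudaMalloc(&x,   k*sizeof(float));
    cudaMalloc(&dst, k*sizeof(float));
    const int block = 1024;                    // must not exceed the declared bound
    const int grid  = (k + block - 1) / block;
    scale_f32<<<grid, block>>>(x, dst, 2.0f, k);
    cudaDeviceSynchronize();
    cudaFree(x); cudaFree(dst);
    printf("ok\n");
    return 0;
}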
llama/ggml-cuda/convert.cu

@@ -596,7 +596,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }
 
 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+static __global__ __launch_bounds__(1024) void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
     const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
llama/ggml-cuda/cpy.cu

@@ -57,7 +57,7 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
 }
 
 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+static __global__ __launch_bounds__(1024) void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                    const int nb12, const int nb13) {
llama/ggml-cuda/ggml-cuda.cu

@@ -1581,7 +1581,7 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static __global__ void k_compute_batched_ptrs(
+static __global__ __launch_bounds__(1024) void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
         int64_t ne12, int64_t ne13,
llama/ggml-cuda/mmvq.cu

@@ -76,7 +76,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
 template <ggml_type type, int ncols_y>
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 // tell the compiler to use as many registers as it wants, see nwarps definition below
-__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*16, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
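This is the one change in the commit that does not add a 1024-thread bound: it replaces WARP_SIZE in the existing launch-bounds expression with the literal 16. The commit message does not say why; presumably the target hardware's warp-size constant differs from what the expression assumed. A hypothetical host-side illustration (not from the repository) of how the declared thread budget shifts, assuming the common WARP_SIZE of 32:

// Hypothetical illustration (not from the repository): thread budgets
// declared by the old and new __launch_bounds__ expressions in mmvq.cu.
#include <cstdio>

constexpr int WARP_SIZE = 32;  // assumption: the usual CUDA warp size

constexpr int bound_old(int ncols_y) { return (ncols_y <= 4 ? 4 : 2) * WARP_SIZE; }
constexpr int bound_new(int ncols_y) { return (ncols_y <= 4 ? 4 : 2) * 16; }

int main() {
    const int cases[] = {1, 4, 8};
    for (int ncols_y : cases) {
        // With WARP_SIZE == 32 the new literal halves the declared maximum.
        printf("ncols_y=%d  old=%3d  new=%3d\n", ncols_y, bound_old(ncols_y), bound_new(ncols_y));
    }
    return 0;
}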
llama/ggml-cuda/quantize.cu

@@ -27,7 +27,7 @@
 #include "quantize.cuh"
 #include <cstdint>
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
+static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
     const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (ix0 >= kx0_padded) {
llama/ggml-cuda/rope.cu

@@ -60,7 +60,7 @@ static __device__ void rope_yarn(
 }
 
 template<typename T, bool has_ff>
-static __global__ void rope_norm(
+static __global__ __launch_bounds__(1024) void rope_norm(
         const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
         float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
llama/ggml-cuda/softmax.cu

@@ -38,7 +38,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
 }
 
 template <bool vals_smem, int ncols_template, int block_size_template, typename T>
-static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
+static __global__ __launch_bounds__(1024) void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
                                     const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
     const int tid = threadIdx.x;
llama/ggml-cuda/unary.cu

@@ -68,7 +68,7 @@ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
     dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 }
 
-static __global__ void silu_f32(const float * x, float * dst, const int k) {
+static __global__ __launch_bounds__(1024) void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
make/Makefile.rocm

@@ -76,7 +76,7 @@ GPU_COMPILER_CUFLAGS = \
 	-DGGML_CUDA_MMV_Y=1 \
 	-DGGML_SCHED_MAX_COPIES=4 \
 	-DGGML_USE_HIP \
-	--gpu-max-threads-per-block=1024 \
+	# --gpu-max-threads-per-block=1024 \
 	-DCDNA \
 	-DGGML_USE_LLAMAFILE \
 	-DHIP_FAST_MATH \
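Read together with the source changes above, the Makefile edit moves the 1024-thread cap from a global HIP compiler flag (--gpu-max-threads-per-block=1024, now commented out) to explicit per-kernel __launch_bounds__(1024) annotations. A hedged sketch (not from the repository) of host code that checks the device limit those annotations must stay within:

// Sketch (not from this commit): query the device's per-block thread limit,
// which the per-kernel __launch_bounds__(1024) declarations assume exists.
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        fprintf(stderr, "no device found\n");
        return 1;
    }
    printf("maxThreadsPerBlock = %d\n", prop.maxThreadsPerBlock);
    return 0;
}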