OpenDAS / ollama · Commits

Commit 7a81daf0 (Unverified)
llama: update vendor code to commit ba1cb19c (#8101)
Authored Dec 14, 2024 by Jeffrey Morgan; committed by GitHub on Dec 14, 2024
Parent: 60f75560
Changes: 273 · Showing 20 changed files on this page, with 249 additions and 24 deletions (+249 / -24)
llama/ggml-cuda/norm.cuh             +1   -1
llama/ggml-cuda/opt-step-adamw.cu    +1   -1
llama/ggml-cuda/opt-step-adamw.cuh   +1   -1
llama/ggml-cuda/out-prod.cu          +1   -1
llama/ggml-cuda/out-prod.cuh         +1   -1
llama/ggml-cuda/pad.cu               +1   -1
llama/ggml-cuda/pad.cuh              +1   -1
llama/ggml-cuda/pool2d.cu            +1   -1
llama/ggml-cuda/pool2d.cuh           +1   -1
llama/ggml-cuda/quantize.cu          +1   -1
llama/ggml-cuda/quantize.cuh         +1   -1
llama/ggml-cuda/rope.cu              +230 -3
llama/ggml-cuda/rope.cuh             +1   -1
llama/ggml-cuda/scale.cu             +1   -1
llama/ggml-cuda/scale.cuh            +1   -1
llama/ggml-cuda/softmax.cu           +1   -1
llama/ggml-cuda/softmax.cuh          +1   -1
llama/ggml-cuda/sum.cu               +1   -3
llama/ggml-cuda/sum.cuh              +1   -1
llama/ggml-cuda/sumrows.cu           +1   -1
llama/ggml-cuda/norm.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/opt-step-adamw.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/opt-step-adamw.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/out-prod.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/out-prod.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/pad.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/pad.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/pool2d.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/pool2d.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/quantize.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/quantize.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/rope.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...

@@ -30,6 +30,11 @@ struct rope_corr_dims {
     float v[2];
 };

+struct mrope_sections {
+    int v[4];
+};
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
 ...

@@ -134,6 +139,105 @@ static __global__ void rope_neox(
     dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
+
+template<typename T, bool has_ff>
+static __global__ void rope_multi(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale,
+    int p_delta_rows, float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale,
+    const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows;
+
+    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
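The added rope_multi kernel splits the rotary dimensions into up to four contiguous sections and reads the position for each element pair from a different ne2-strided slice of pos, selected by sector = (i0/2) % sect_dims. As a rough host-side illustration of that mapping, not part of this commit and using made-up section widths, one could write:

// Sketch only: mirrors the sector -> position-channel mapping used by rope_multi.
// The section widths below are hypothetical, chosen purely for illustration.
#include <cstdio>

int main() {
    const int sections[4] = {16, 24, 16, 8};  // hypothetical per-section rotary dims
    const int sect_dims   = sections[0] + sections[1] + sections[2] + sections[3];
    const int sec_w       = sections[0] + sections[1];

    for (int i0 = 0; i0 < 2 * sect_dims; i0 += 2) {
        const int sector = (i0 / 2) % sect_dims;
        int channel;  // which ne2-strided slice of pos[] this pair would read
        if      (sector < sections[0])         channel = 0;
        else if (sector < sec_w)               channel = 1;
        else if (sector < sec_w + sections[2]) channel = 2;
        else                                   channel = 3;
        printf("pair i0=%3d -> sector %2d -> pos channel %d\n", i0, sector, channel);
    }
    return 0;
}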
+
+template<typename T, bool has_ff>
+static __global__ void rope_vision(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale,
+    int p_delta_rows, float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale,
+    const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows; // i2-th tokens
+
+    int sect_dims = sections.v[0] + sections.v[1];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        const int p = sector;
+        theta_base = pos[i2]*powf(theta_scale, p);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        const int p = sector - sections.v[0];
+        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims];
+
+    dst[i + 0]      = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
+}
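Compared with rope_multi above, the vision variant folds only the first two sections, pairs element i with i + n_dims (a full-width offset rather than n_dims/2), and restarts the frequency exponent at the start of each section (p = sector - sections.v[0]) instead of using i0/2. A minimal sketch of that exponent difference, not part of this commit and again using hypothetical section widths:

// Sketch only: compares the theta exponent per rotary pair in rope_multi vs
// rope_vision, assuming two hypothetical sections of width 8.
#include <cstdio>

int main() {
    const int sections[2] = {8, 8};  // hypothetical section widths
    const int sect_dims   = sections[0] + sections[1];

    for (int i0 = 0; i0 < 2 * sect_dims; i0 += 2) {
        const int   sector = (i0 / 2) % sect_dims;
        const float multi  = i0 / 2.0f;                       // exponent used by rope_multi
        const int   vision = sector < sections[0] ? sector    // exponent used by rope_vision
                                                  : sector - sections[0];
        printf("i0=%3d  multi exponent=%4.1f  vision exponent=%d\n", i0, multi, vision);
    }
    return 0;
}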
 template<typename T>
 static void rope_norm_cuda(
     const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
 ...

@@ -182,6 +286,56 @@ static void rope_neox_cuda(
     }
 }
+
+template<typename T>
+static void rope_multi_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale,
+    int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims,
+    const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor,
+                attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    } else {
+        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor,
+                attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    }
+}
+
+template<typename T>
+static void rope_vision_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale,
+    int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims,
+    const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+    // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
+    // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor,
+                attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    } else {
+        rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor,
+                attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    }
+}
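Both new launchers reuse the launch geometry of the existing rope kernels: one grid column per row (nr covers heads times sequence length) and enough CUDA_ROPE_BLOCK_SIZE-wide chunks along y to cover the ne0 elements, with each thread handling one (x0, x1) pair. A quick standalone check of that arithmetic, not part of the diff, with the block size and tensor shape assumed purely for illustration:

// Sketch only: reproduces the launch-geometry arithmetic of rope_multi_cuda /
// rope_vision_cuda with assumed sizes; CUDA_ROPE_BLOCK_SIZE is assumed to be 256 here.
#include <cstdio>

int main() {
    const int block_size = 256;        // assumed value of CUDA_ROPE_BLOCK_SIZE
    const int ne0        = 128;        // assumed head dimension
    const int nr         = 32 * 1024;  // assumed heads * sequence length (rows)

    // Each thread rotates one element pair, so a block covers 2*block_size elements of a row.
    const int n_blocks_x = (ne0 + 2 * block_size - 1) / (2 * block_size);

    printf("grid = (%d, %d, 1), block = (1, %d, 1)\n", nr, n_blocks_x, block_size);
    return 0;
}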
 static void rope_norm_cuda_f16(
     const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
 ...

@@ -211,6 +365,38 @@ static void rope_neox_cuda_f32(
     rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }
+
+static void rope_multi_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+
+    rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_multi_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+
+    rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+
+    rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+
+    rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
 ...

@@ -227,8 +413,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
+    const int64_t ne00 = src0->ne[0]; // head dims
+    const int64_t ne01 = src0->ne[1]; // num heads
+    const int64_t ne02 = src0->ne[2]; // num heads
     const int64_t nr = ggml_nrows(src0);

     //const int n_past = ((int32_t *) dst->op_params)[0];
 ...

@@ -236,6 +423,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int mode = ((int32_t *) dst->op_params)[2];
     //const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    mrope_sections sections;

     // RoPE alteration for extended context
     float freq_base;
 ...

@@ -251,8 +439,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections.v,  (int32_t *) dst->op_params + 11, sizeof(int)*4);

     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }
     const int32_t * pos = (const int32_t *) src1_d;
 ...

@@ -279,6 +478,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         } else {
             GGML_ABORT("fatal error");
         }
+    } else if (is_mrope && !is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_multi_cuda_f32(
+                (const float *) src0_d, (float *) dst_d, ne00, ne02, n_dims, nr, pos, freq_scale,
+                ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_multi_cuda_f16(
+                (const half *) src0_d, (half *) dst_d, ne00, ne02, n_dims, nr, pos, freq_scale,
+                ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_vision_cuda_f32(
+                (const float *) src0_d, (float *) dst_d, ne00, ne02, n_dims, nr, pos, freq_scale,
+                ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_vision_cuda_f16(
+                (const half *) src0_d, (half *) dst_d, ne00, ne02, n_dims, nr, pos, freq_scale,
+                ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
     } else {
         if (src0->type == GGML_TYPE_F32) {
             rope_norm_cuda_f32(
 ...
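The new dispatch in ggml_cuda_op_rope keys off the rope mode word: is_neox and is_mrope are bit tests, while is_vision requires the exact GGML_ROPE_TYPE_VISION value, which appears to also carry the M-RoPE bit, hence the is_mrope && !is_vision guard. A standalone sketch of that selection order, not part of the commit, with the flag values defined locally as assumptions rather than quoted from ggml.h:

// Sketch only: mirrors the is_neox / is_mrope / is_vision selection order in
// ggml_cuda_op_rope. The bit values below are assumptions for illustration;
// the real constants live in ggml.h.
#include <cstdio>

enum {
    ROPE_TYPE_NEOX   = 1 << 1,                     // assumed bit layout
    ROPE_TYPE_MROPE  = 1 << 3,                     // assumed bit layout
    ROPE_TYPE_VISION = ROPE_TYPE_MROPE | (1 << 4)  // assumed: vision implies the mrope bit
};

static const char * pick_kernel(int mode) {
    const bool is_neox   = mode & ROPE_TYPE_NEOX;
    const bool is_mrope  = mode & ROPE_TYPE_MROPE;
    const bool is_vision = mode == ROPE_TYPE_VISION;

    if (is_neox)                 return "rope_neox";
    if (is_mrope && !is_vision)  return "rope_multi";
    if (is_vision)               return "rope_vision";
    return "rope_norm";
}

int main() {
    const int modes[] = {0, ROPE_TYPE_NEOX, ROPE_TYPE_MROPE, ROPE_TYPE_VISION};
    for (int mode : modes) {
        printf("mode=%2d -> %s\n", mode, pick_kernel(mode));
    }
    return 0;
}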
llama/ggml-cuda/rope.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/scale.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/scale.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/softmax.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/softmax.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/sum.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...

@@ -29,8 +29,6 @@
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700

 #ifdef USE_CUB
-// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
-// For this reason CUB must be included BEFORE anything else.
 #include <cub/cub.cuh>
 using namespace cub;
 #endif // USE_CUB
 ...
llama/ggml-cuda/sum.cuh

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...
llama/ggml-cuda/sumrows.cu

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
 ...