OpenDAS / ollama · Commit e9e5f61c (unverified)

llama: update to commit 2016f07b (#10352)

Authored Apr 25, 2025 by Jeffrey Morgan; committed via GitHub Apr 24, 2025.
Parent: 11dde418
Changes: 46 files in this commit (page 1 of 3 shown below). Showing 6 changed files with 210 additions and 97 deletions (+210 -97):

  ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu             +45  -87
  ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h            +2   -0
  ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt            +0   -4
  ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal  +27  -0
  ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m            +109 -6
  ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal        +27  -0
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
         err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
     } else {
         err = cudaMalloc(ptr, size);
     }
     return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-#endif
 }
 
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
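
Note: with this change the compile-time GGML_HIP_UMA flag is gone; managed (unified) memory is opted into at runtime on both CUDA and HIP builds. A minimal sketch of the opt-in follows; the main() wrapper and POSIX setenv are assumptions for illustration, not part of the diff:

#include <cstdlib>

int main() {
    // Must be set before the backend allocates anything; ggml_cuda_device_malloc
    // (above) then tries cudaMallocManaged first and, on HIP, falls back to a
    // plain allocation when managed memory is unsupported (e.g. on Windows).
    setenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY", "1", /*overwrite=*/1);
    return 0;
}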
@@ -2341,11 +2342,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
-#if !defined(GGML_DISABLE_FLASH_ATTN)
         case GGML_OP_FLASH_ATTN_EXT:
             ggml_cuda_flash_attn_ext(ctx, dst);
             break;
-#endif
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
@@ -2477,10 +2476,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 
 #ifdef USE_CUDA_GRAPH
 static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+    bool use_cuda_graph) {
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2498,7 +2498,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
 #endif
         }
@@ -2512,8 +2512,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }
 
         if (node->op == GGML_OP_CPY) {
-            // store the copy op parameter which changes with each token.
-            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+
+            // Store the pointers which are updated for each token, such that these can be sent
+            // to the device and accessed using indirection from CUDA graph
+            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
+
             // store a pointer to each copy op CUDA kernel to identify it later
             void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
             if (!ptr) {
@@ -2521,10 +2524,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
-            } else {
-                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
-                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
-                }
-            }
+            }
         }
@@ -2533,6 +2532,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }
     }
 
+    if (use_cuda_graph) {
+        cuda_ctx->cuda_graph->use_cpy_indirection = true;
+        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
+        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
+    }
+
     return use_cuda_graph;
 }
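
Note: ggml_cuda_cpy_dest_ptrs_copy uploads the per-token destination pointers to the device in one async copy, so captured copy kernels can read them through a device-side table instead of having pointers baked into their kernel parameters. A sketch of the indirection idea, with assumed names (this is not the actual ggml copy kernel):

__global__ void cpy_indirect(const char * src, char ** dest_ptrs, int idx, size_t n) {
    char * dst = dest_ptrs[idx];  // destination resolved on the device at launch time
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i];
    }
}

Because the table's contents change each token but its address does not, the captured CUDA graph can be replayed without any kernel-parameter rewriting.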
@@ -2587,51 +2592,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }
 
-static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
-
-    if (cuda_graph_update_required) {
-        // Extract nodes from graph
-        // First call with null argument gets number of nodes in graph
-        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
-
-        // Subsequent call with non-null argument gets nodes
-        cuda_ctx->cuda_graph->nodes.clear();
-        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
-        cuda_ctx->cuda_graph->params.clear();
-        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
-        if (cuda_ctx->cuda_graph->num_nodes > 0) {
-            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
-
-            // Loop over nodes, and extract kernel parameters from each node
-            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                cudaGraphNodeType node_type;
-                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
-                if (node_type == cudaGraphNodeTypeKernel) {
-                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
-                    if (stat == cudaErrorInvalidDeviceFunction) {
-                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
-                        // We don't need to update blas nodes, so clear error and move on.
-                        (void)cudaGetLastError();
-                    } else {
-                        GGML_ASSERT(stat == cudaSuccess);
-                    }
-                }
-            }
-        }
-    } else {
-        // One of the arguments to the copy kernel is updated for each token, hence we need to
-        // replace that argument with the updated value in the CUDA graph
-        // on update steps, the live parameters will already be captured
-        int k = 0;
-        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-            if (count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
-                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
-                *(void **) cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void **) updated_kernel_arg_ptr;
-                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
-            }
-        }
-    }
-}
-
 static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
 
     bool cuda_graph_update_required = false;
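
Note: the deleted maintain_cuda_graph was the old mechanism: re-query the captured graph's kernel nodes and patch the copy kernels' pointer arguments on every token. For reference, its node extraction used the standard two-call CUDA graph idiom, sketched here with assumed locals (CUDA_CHECK is ggml's error-checking macro):

size_t num_nodes = 0;
CUDA_CHECK(cudaGraphGetNodes(graph, nullptr, &num_nodes));       // first call: count only
std::vector<cudaGraphNode_t> nodes(num_nodes);
CUDA_CHECK(cudaGraphGetNodes(graph, nodes.data(), &num_nodes));  // second call: fill array

With copy destinations now read through device-side indirection, none of this per-token patching is required.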
@@ -2691,8 +2651,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 #endif
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured,
-    bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2742,13 +2701,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
             CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
         }
 
-        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
-        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
-
-        // Update graph executable
-        update_cuda_graph_executable(cuda_ctx);
+        if (cuda_graph_update_required) { // Update graph executable
+            update_cuda_graph_executable(cuda_ctx);
+        }
 
         // Launch graph
         CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
 #else
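
Note: update_cuda_graph_executable is now invoked only when the graph topology actually changed, rather than on every token. Its usual shape is the update-or-reinstantiate idiom; a sketch with assumed locals and CUDA 12 signatures (not the function's verbatim body):

cudaGraphExecUpdateResultInfo result_info;
cudaError_t stat = cudaGraphExecUpdate(instance, graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
    (void)cudaGetLastError();  // clear the error, then rebuild from scratch
    CUDA_CHECK(cudaGraphExecDestroy(instance));
    CUDA_CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
} else {
    GGML_ASSERT(stat == cudaSuccess);
}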
@@ -2762,10 +2717,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
ggml_cuda_set_device
(
cuda_ctx
->
device
);
// vector of pointers to CUDA cpy kernels, which are required to identify
// kernel parameters which need updated in the graph for each token
std
::
vector
<
void
*>
ggml_cuda_cpy_fn_ptrs
;
#ifdef USE_CUDA_GRAPH
static
const
bool
disable_cuda_graphs_due_to_env
=
(
getenv
(
"GGML_CUDA_DISABLE_GRAPHS"
)
!=
nullptr
);
...
...
@@ -2799,8 +2750,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
         if (use_cuda_graph && cuda_graph_update_required) {
@@ -2821,6 +2771,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
+    if (!use_cuda_graph) {
+        cuda_ctx->cuda_graph->use_cpy_indirection = false;
+    }
+
 #else
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;
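
Note: the added block resets use_cpy_indirection whenever graph capture is not in play, so the copy kernels take their direct-pointer path on the next evaluation. For context, the capture idiom visible above: work launched on a stream between BeginCapture and EndCapture (which happens later in this function) is recorded into a graph rather than executed immediately. A sketch with assumed locals:

cudaGraph_t graph;
CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
// ... launch the model's kernels on `stream` ...
CUDA_CHECK(cudaStreamEndCapture(stream, &graph));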
@@ -2828,7 +2782,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
     bool graph_evaluated_or_captured = false;
 
-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
 
     return GGML_STATUS_SUCCESS;
 }
@@ -3290,6 +3244,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             if (op->src[0]->ne[0] == 192) {
                 return false;
             }
+            if (op->src[0]->ne[0] == 576) {
+                // DeepSeek MLA
+                return false;
+            }
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }
ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
@@ -71,6 +71,8 @@
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
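
Note: these two mappings let the new unified-memory path in ggml-cuda.cu compile unchanged on ROCm. Sketch of the effect (illustrative only; the helper below is not in the tree):

// Under vendors/hip.h the cuda* names are rewritten by the preprocessor:
//   cudaMallocManaged -> hipMallocManaged, cudaMemAdvise -> hipMemAdvise
static cudaError_t alloc_managed_example(void ** ptr, size_t size) {
    return cudaMallocManaged(ptr, size);  // one source line, two backends
}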
ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt
@@ -89,10 +89,6 @@ endif()
 add_compile_definitions(GGML_USE_HIP)
 
-if (GGML_HIP_UMA)
-    add_compile_definitions(GGML_HIP_UMA)
-endif()
-
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal (+27 -0; diff collapsed)

ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m (+109 -6; diff collapsed)

ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal (+27 -0; diff collapsed)