Unverified commit e9e5f61c authored by Jeffrey Morgan, committed by GitHub

llama: update to commit 2016f07b (#10352)

parent 11dde418
@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
         err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
     }
     else
     {
         err = cudaMalloc(ptr, size);
     }
     return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-#endif
 }
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
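Note: the rewritten allocator above drops the separate GGML_HIP_UMA build option and routes both backends through the GGML_CUDA_ENABLE_UNIFIED_MEMORY environment variable, falling back to a plain device allocation when managed memory is unavailable. A minimal standalone sketch of that pattern, using only generic CUDA runtime calls and a hypothetical helper name (no HIP-specific coarse-grain advice):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch only: prefer managed (unified) memory when GGML_CUDA_ENABLE_UNIFIED_MEMORY
// is set, and fall back to a plain device allocation if managed memory is not
// supported on this platform. device_malloc_with_fallback is a hypothetical name.
static cudaError_t device_malloc_with_fallback(void ** ptr, size_t size) {
    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
        cudaError_t err = cudaMallocManaged(ptr, size);
        if (err == cudaErrorNotSupported) {
            static bool warned = false;
            if (!warned) {
                fprintf(stderr, "cudaMallocManaged unsupported, falling back to cudaMalloc\n");
                warned = true;
            }
            (void) cudaGetLastError(); // clear the sticky error before retrying
            err = cudaMalloc(ptr, size);
        }
        return err;
    }
    return cudaMalloc(ptr, size);
}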
@@ -2341,11 +2342,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
-#if !defined(GGML_DISABLE_FLASH_ATTN)
         case GGML_OP_FLASH_ATTN_EXT:
             ggml_cuda_flash_attn_ext(ctx, dst);
             break;
-#endif
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
@@ -2477,10 +2476,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 #ifdef USE_CUDA_GRAPH
 static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+    bool use_cuda_graph) {
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2498,7 +2498,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
 #endif
         }
@@ -2512,8 +2512,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }
         if (node->op == GGML_OP_CPY) {
-            // store the copy op parameter which changes with each token.
-            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+            // Store the pointers which are updated for each token, such that these can be sent
+            // to the device and accessed using indirection from CUDA graph
+            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
             // store a pointer to each copy op CUDA kernel to identify it later
             void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
             if (!ptr) {
@@ -2521,10 +2524,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
-            } else {
-                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
-                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
-                }
             }
         }
@@ -2533,6 +2532,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }
     }
+    if (use_cuda_graph) {
+        cuda_ctx->cuda_graph->use_cpy_indirection = true;
+        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
+        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
+    }
     return use_cuda_graph;
 }
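Note: ggml_cuda_cpy_dest_ptrs_copy is the piece that enables the indirection scheme. The per-token destination pointers collected in cpy_dest_ptrs are mirrored into a device-side array on the compute stream, so the copy kernels captured in the CUDA graph can read the current destinations through one level of indirection instead of having their kernel arguments patched every token. A rough sketch of what such a helper might look like (struct and field names are illustrative, not ggml's; error checking omitted):

#include <cuda_runtime.h>
#include <cstddef>

// Sketch only: mirror an array of host-side destination pointers into device
// memory so kernels captured in a CUDA graph can read the current values through
// one level of indirection. Struct and field names are illustrative, not ggml's.
struct cpy_indirection_state {
    char ** dest_ptrs_d = nullptr; // device array holding the destination pointers
    int     capacity    = 0;       // number of pointers the device array can hold
};

static void copy_dest_ptrs_to_device(cpy_indirection_state * st,
                                     char ** host_ptrs, int n, cudaStream_t stream) {
    if (n > st->capacity) {
        // grow the device-side array when the graph contains more copy ops
        if (st->dest_ptrs_d != nullptr) {
            cudaFree(st->dest_ptrs_d);
        }
        cudaMalloc((void **) &st->dest_ptrs_d, n * sizeof(char *));
        st->capacity = n;
    }
    // one small async copy per token, ordered on the same stream as the graph
    // launch, so the captured kernels always see the latest destinations
    cudaMemcpyAsync(st->dest_ptrs_d, host_ptrs, n * sizeof(char *),
                    cudaMemcpyHostToDevice, stream);
}

Because the update is a single cudaMemcpyAsync on the same stream that launches the graph, stream ordering guarantees it completes before the next graph launch, with no per-node parameter patching required.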
@@ -2587,51 +2592,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }
-static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
-    if (cuda_graph_update_required) {
-        // Extract nodes from graph
-        // First call with null argument gets number of nodes in graph
-        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
-        // Subsequent call with non-null argument gets nodes
-        cuda_ctx->cuda_graph->nodes.clear();
-        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
-        cuda_ctx->cuda_graph->params.clear();
-        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
-        if (cuda_ctx->cuda_graph->num_nodes > 0) {
-            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
-            // Loop over nodes, and extract kernel parameters from each node
-            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                cudaGraphNodeType node_type;
-                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
-                if (node_type == cudaGraphNodeTypeKernel) {
-                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
-                    if (stat == cudaErrorInvalidDeviceFunction) {
-                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
-                        // We don't need to update blas nodes, so clear error and move on.
-                        (void)cudaGetLastError();
-                    } else {
-                        GGML_ASSERT(stat == cudaSuccess);
-                    }
-                }
-            }
-        }
-    } else {
-        // One of the arguments to the copy kernel is updated for each token, hence we need to
-        // replace that argument with the updated value in the CUDA graph
-        // on update steps, the live parameters will already be captured
-        int k = 0;
-        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
-                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
-                *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
-                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
-            }
-        }
-    }
-}
 static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
     bool cuda_graph_update_required = false;
@@ -2691,8 +2651,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 #endif
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
-    bool & cuda_graph_update_required) {
+    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2742,13 +2701,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
                 CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
             }
-            // Perform update to graph (if required for this token), and change copy parameter (required for every token)
-            maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
-            // Update graph executable
-            update_cuda_graph_executable(cuda_ctx);
+            if (cuda_graph_update_required) { // Update graph executable
+                update_cuda_graph_executable(cuda_ctx);
+            }
             // Launch graph
             CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
 #else
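Note: the surrounding code follows the usual CUDA graph lifecycle that this change simplifies: capture the stream into a graph, instantiate an executable once, then relaunch it each token, refreshing the executable only when the graph topology changed. A compressed, self-contained illustration of that lifecycle (toy kernel and loop, not the code from this diff):

#include <cuda_runtime.h>

// Sketch only: the capture -> instantiate -> launch lifecycle managed by the
// code in this diff. The kernel and the loop are toy examples.
__global__ void scale(float * x, int n, float f) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= f;
    }
}

static void run_with_graph(float * x_d, int n, int iters, cudaStream_t stream) {
    cudaGraph_t     graph    = nullptr;
    cudaGraphExec_t instance = nullptr;

    // capture the work issued on the stream into a graph instead of running it
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
    scale<<<(n + 255) / 256, 256, 0, stream>>>(x_d, n, 1.01f);
    cudaStreamEndCapture(stream, &graph);

    // instantiate once, then relaunch the executable graph cheaply every iteration
    cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
    for (int i = 0; i < iters; i++) {
        cudaGraphLaunch(instance, stream);
    }
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(instance);
    cudaGraphDestroy(graph);
}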
@@ -2762,10 +2717,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     ggml_cuda_set_device(cuda_ctx->device);
-    // vector of pointers to CUDA cpy kernels, which are required to identify
-    // kernel parameters which need updated in the graph for each token
-    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -2799,8 +2750,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
-            ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
         if (use_cuda_graph && cuda_graph_update_required) {
@@ -2821,6 +2771,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
+    if (!use_cuda_graph) {
+        cuda_ctx->cuda_graph->use_cpy_indirection = false;
+    }
 #else
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;
@@ -2828,7 +2782,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     bool graph_evaluated_or_captured = false;
-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
     return GGML_STATUS_SUCCESS;
 }
@@ -3290,6 +3244,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             if (op->src[0]->ne[0] == 192) {
                 return false;
             }
+            if (op->src[0]->ne[0] == 576) {
+                // DeepSeek MLA
+                return false;
+            }
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }
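Note: the added case mirrors the existing 192 check one line above: flash-attention support is gated on the K head size (op->src[0]->ne[0]), and 576, the head size the DeepSeek MLA path uses, is reported as unsupported here. Reduced to a sketch with a hypothetical helper:

#include <cstdint>

// Sketch only: flash-attention support gated on the K head size, as in the
// supports_op check above. The helper is hypothetical; the head-size list
// matches the two rejected cases in this diff.
static bool flash_attn_head_size_supported(int64_t head_size) {
    switch (head_size) {
        case 192: // already rejected before this change
        case 576: // DeepSeek MLA head size, rejected by this change
            return false;
        default:
            return true;
    }
}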
......
@@ -71,6 +71,8 @@
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
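Note: these two defines extend the existing CUDA-to-HIP aliasing table so the new managed-memory code path compiles unchanged under ROCm. A stripped-down illustration of the same aliasing trick, limited to the calls involved here:

// Sketch only: alias the CUDA runtime names to their HIP equivalents so the same
// call sites compile on both backends. GGML_USE_HIP matches the guard used in
// this tree; only the aliases relevant to this diff are shown.
#if defined(GGML_USE_HIP)
#include <hip/hip_runtime.h>
#define cudaSuccess       hipSuccess
#define cudaMallocManaged hipMallocManaged
#define cudaMemAdvise     hipMemAdvise
#else
#include <cuda_runtime.h>
#endif

// written once against the CUDA names, works on either backend
static bool alloc_managed(void ** ptr, size_t size) {
    return cudaMallocManaged(ptr, size) == cudaSuccess;
}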
......
@@ -89,10 +89,6 @@ endif()
 add_compile_definitions(GGML_USE_HIP)
-if (GGML_HIP_UMA)
-    add_compile_definitions(GGML_HIP_UMA)
-endif()
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
......