// workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
// number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
// NOTE(review): this is the tail of a conditional whose test lies above this chunk; it appears to
// detect a batch size > 1 (see the log message below) — confirm against the full function.
use_cuda_graph=false;
#ifndef NDEBUG
// Debug-only diagnostic: report which node triggered the graph opt-out and its four dimensions.
GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n",__func__,node->name,node->ne[0],node->ne[1],node->ne[2],node->ne[3]);
#endif
}
// Handle copy nodes separately: per the comment below, their pointers change on every token,
// so they must be recorded for later refresh. The body of this branch continues past this chunk.
if(node->op==GGML_OP_CPY){
// Store the pointers which are updated for each token, such that these can be sent