Unverified Commit 0cf7794b authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

ggml update to b7108 (#12992)

* Revert "vulkan: temporary cary of vulkan fixes (#12971)"

This reverts commit 3a9e8e9f.

* ggml update to b7087

* fix argsort on metal

* update to b7108

* fix bakllava regression

This model lacks the metadata for the projector type.

* update to b7209

* fix TopK perf

* only build arm code on arm
parent 854d40ed
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise negation: data_d[idx] = -data_a[idx] for idx < p.KX.
void main() {
    // Flatten the 3D dispatch into one linear element index
    // (512 threads per workgroup along x, 512 groups per y slice).
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx < p.KX) {
        data_d[idx] = D_TYPE(-float(data_a[idx]));
    }
}
...@@ -61,7 +61,7 @@ void quantize() { ...@@ -61,7 +61,7 @@ void quantize() {
const uint a_idx = ib * 8 + iqs; const uint a_idx = ib * 8 + iqs;
vec4 vals = a_idx < p.ne ? data_a[a_idx] : vec4(0.0f); vec4 vals = a_idx < p.ne / 4 ? data_a[a_idx] : vec4(0.0f);
const vec4 abs_vals = abs(vals); const vec4 abs_vals = abs(vals);
// Find absolute max for each block // Find absolute max for each block
......
...@@ -3,6 +3,32 @@ ...@@ -3,6 +3,32 @@
#include "generic_binary_head.glsl" #include "generic_binary_head.glsl"
#include "types.glsl" #include "types.glsl"
#if RMS_NORM_ROPE_FUSION
// Fused RMS_NORM + ROPE variant: bindings 0/1 are the rms_norm inputs,
// bindings 3..6 are the rope inputs/outputs.
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
// data is passed from rms_norm -> rope through shared memory.
// rms_norm calls this data_d, rope calls this rope_data_a.
// Binding 2 is not used
shared FLOAT_TYPE rope_data_a[1024];
#define data_d rope_data_a
layout (binding = 3) readonly buffer R_Y {int rope_data_pos[];};
layout (binding = 4) readonly buffer R_Z {float rope_data_ff[];};
layout (binding = 5) writeonly buffer R_D {ROPE_D_TYPE rope_data_d[];};
layout (binding = 6) readonly buffer R_I {uvec2 rope_data_i[];}; // indices for set_rows
#include "rope_params.glsl"
#include "rope_funcs.glsl"
// Rope mode selectors compared against rope_params.rope_mode; names mirror
// ggml's GGML_ROPE_TYPE_* constants.
#define GGML_ROPE_TYPE_NORMAL 0
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#endif
#extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
...@@ -28,8 +54,12 @@ void rms_norm(uint num_iters) { ...@@ -28,8 +54,12 @@ void rms_norm(uint num_iters) {
uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset();
uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset();
#if RMS_NORM_ROPE_FUSION
// Per-row offset in shared memory
uint32_t d_offset = 0;
#else
uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();
#endif
FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp
[[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) {
...@@ -79,6 +109,18 @@ void rms_norm(uint num_iters) { ...@@ -79,6 +109,18 @@ void rms_norm(uint num_iters) {
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
} }
} }
#if RMS_NORM_ROPE_FUSION
barrier();
rope_params rp = p.rope;
uint rope_row = (samp*nchannels + channel)*nrows + row;
for (uint t = 2*tid; t < ncols; t += 2*BLOCK_SIZE) {
if (rp.rope_mode == GGML_ROPE_TYPE_NEOX) {
rope_neox(t, rope_row, rp);
} else if (rp.rope_mode == GGML_ROPE_TYPE_NORMAL) {
rope_norm(t, rope_row, rp);
}
}
#endif
} }
void main() { void main() {
......
// YaRN ramp weight for dimension index i0: 1 below `low`, 0 above `high`,
// linear in between (the divisor is floored at 0.001 to avoid div-by-zero).
float rope_yarn_ramp(const float low, const float high, const uint i0) {
    const float t = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - clamp(t, 0.0f, 1.0f);
}
// Source index for rope element i0 of row i01, channel i02. In the fused
// rms_norm+rope build the source is the per-row shared-memory staging buffer,
// so only the column offset is needed; otherwise index the global buffer.
uint rope_a_coord(const uint i0, const uint i01, const uint i02, rope_params p) {
#if RMS_NORM_ROPE_FUSION
// Per-row offset in shared memory
const uint ix = i0;
#else
// Standalone rope: apply the channel (nb02) and row (nb01) strides.
const uint ix = i02*p.nb02 + i01*p.nb01 + i0;
#endif
return ix;
}
// YaRN-corrected rotation angle: blends the interpolated (freq-scaled) and
// extrapolated theta using the ramp for this dimension, then emits the
// magnitude-scaled cos/sin used for the 2D rotation.
void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta, rope_params p) {
float mscale = p.attn_factor;
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = p.freq_scale * theta_extrap;
float theta = theta_interp;
if (p.ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
}
// Backpropagation uses inverted rotation
if (p.is_back != 0) {
theta = -theta;
}
cos_theta = cos(theta) * mscale;
sin_theta = sin(theta) * mscale;
}
// "Normal" rope: rotates adjacent element pairs (i0, i0+1) of row i1.
// i0 is the even element index within the row; i1 is the flattened row index.
void rope_norm(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
if (i0 >= ne0) {
return;
}
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0;
const uint ix = rope_a_coord(i0, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
// Dimensions past n_dims are passed through unrotated.
if (i0 >= p.n_dims) {
rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]);
rope_data_d[idst + 1] = ROPE_D_TYPE(rope_data_a[ix + 1]);
return;
}
const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
// Standard 2D rotation of the (x0, x1) pair by theta.
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + 1]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + 1] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
// NEOX-style rope: rotates element pairs split across halves of the head,
// (i0/2, i0/2 + n_dims/2), instead of adjacent elements.
void rope_neox(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
if (i0 >= ne0) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
// Dimensions past n_dims are passed through unrotated.
if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);
return;
}
const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
// Rotate the pair taken from the two halves of the rotated dimensions.
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + p.n_dims/2]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
// Multi-section rope (M-RoPE): the rotated dimensions are partitioned into up
// to four sections, each taking its position from a different slice of
// rope_data_pos (offset by multiples of ne2). is_imrope selects an
// interleaved (mod-3) section assignment instead of contiguous ranges.
void rope_multi(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
uint ne2 = p.ne02;
if (i0 >= ne0) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Dimensions past n_dims are passed through unrotated.
if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);
return;
}
const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (p.is_imrope != 0) {
// Interleaved assignment: sector index mod 3 picks the position slice.
if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
//} else {
//    theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
}
} else {
// Contiguous assignment: sector falls into one of four ranges.
if (sector < p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= p.sections[0] && sector < sec_w) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= sec_w + p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
}
}
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
// NEOX-style pair layout: halves of the rotated dimensions.
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + p.n_dims/2]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
// Vision rope: two sections whose angle exponent is the position *within* the
// section (p0), not i0/2; the rotated pair is (i0/2, i0/2 + n_dims).
void rope_vision(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
uint ne2 = p.ne02;
if (i0 >= ne0) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
const int sect_dims = p.sections[0] + p.sections[1];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < p.sections[0]) {
// First section uses the first position slice.
const uint p0 = sector;
theta_base = rope_data_pos[i02]*pow(p.theta_scale, p0);
}
else if (sector >= p.sections[0] && sector < sec_w) {
// Second section uses the position slice offset by ne2.
const uint p0 = sector - p.sections[0];
theta_base = rope_data_pos[i02 + ne2]*pow(p.theta_scale, p0);
}
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + p.n_dims]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + p.n_dims] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
...@@ -3,55 +3,18 @@ ...@@ -3,55 +3,18 @@
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
#include "rte.glsl" #include "rte.glsl"
#include "rope_params.glsl"
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 0) readonly buffer X {A_TYPE rope_data_a[];};
layout (binding = 1) readonly buffer Y {int data_pos[];}; layout (binding = 1) readonly buffer Y {int rope_data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];}; layout (binding = 2) readonly buffer Z {float rope_data_ff[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];}; layout (binding = 3) writeonly buffer D {ROPE_D_TYPE rope_data_d[];};
layout (binding = 4) readonly buffer I {uvec2 data_i[];}; // indices for set_rows layout (binding = 4) readonly buffer I {uvec2 rope_data_i[];}; // indices for set_rows
layout (push_constant) uniform parameter {
uint ncols;
uint n_dims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float theta_scale;
uint has_ff;
uint ne02;
uint s1;
uint s2;
int sections[4];
uint is_back;
uint set_rows_stride;
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
}
void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) { layout (push_constant) uniform parameter {
float mscale = p.attn_factor; rope_params pc;
// Get n-d rotational scaling corrected for extrapolation };
float theta_interp = p.freq_scale * theta_extrap;
float theta = theta_interp;
if (p.ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
}
// Backprogagation uses inverted rotation
if (p.is_back != 0) {
theta = -theta;
}
cos_theta = cos(theta) * mscale;
sin_theta = sin(theta) * mscale;
}
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
uint ne2 = p.ne02; rope_multi(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
const uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
if (i0 >= p.n_dims) {
data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
return;
}
const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
}
else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
}
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + p.n_dims/2]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
rope_neox(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = row_x*ne0 + i0/2;
idst += data_i[channel_x].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
data_d[idst + i0/2 + 0] = D_TYPE(data_a[ix + i0/2 + 0]);
data_d[idst + i0/2 + 1] = D_TYPE(data_a[ix + i0/2 + 1]);
return;
}
const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + p.n_dims/2]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
rope_norm(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
uint idst = row_dst*ne0 + i0;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = row_x*ne0 + i0;
idst += data_i[channel_x].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
data_d[idst + 0] = D_TYPE(data_a[ix + 0]);
data_d[idst + 1] = D_TYPE(data_a[ix + 1]);
return;
}
const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + 1]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + 1] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#if !defined(GGML_ROPE_PARAMS)
#define GGML_ROPE_PARAMS
#include "rte.glsl"
// Parameter block shared by all rope shader variants (standalone push
// constants and the fused rms_norm+rope path).
struct rope_params {
uint rope_mode; // GGML_ROPE_TYPE_* selector
uint ncols; // row width (ne0)
uint n_dims; // number of rotated dimensions; the rest pass through
float freq_scale; // theta interpolation scale
uint p_delta_rows; // rows per channel (ne1); used to split the flat row index
float freq_base;
float ext_factor; // YaRN extrapolation mix factor; 0 disables the ramp blend
float attn_factor; // base magnitude scale applied to cos/sin
float corr_dims[2]; // YaRN ramp bounds [low, high]
float theta_scale; // per-dimension frequency decay base
uint has_ff; // nonzero if the frequency-factor buffer is bound
uint ne02; // channels; stride between position slices in rope_data_pos
uint nb01; // row stride of the source (unused in the fused path)
uint nb02; // channel stride of the source (unused in the fused path)
int sections[4]; // section widths for mrope/vision modes
uint is_imrope; // nonzero: interleaved (mod-3) section assignment
uint is_back; // nonzero: backward pass, rotation angle is negated
uint set_rows_stride; // nonzero enables the fused ROPE+VIEW+SET_ROWS output path
};
#endif // !defined(GGML_ROPE_PARAMS)
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
uint ne2 = p.ne02; rope_vision(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
const uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
const int sect_dims = p.sections[0] + p.sections[1];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < p.sections[0]) {
const uint p0 = sector;
theta_base = data_pos[channel_x]*pow(p.theta_scale, p0);
}
else if (sector >= p.sections[0] && sector < sec_w) {
const uint p0 = sector - p.sections[0];
theta_base = data_pos[channel_x + ne2]*pow(p.theta_scale, p0);
}
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + p.n_dims]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + p.n_dims] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise round-to-nearest with halfway cases away from zero,
// matching C's roundf (GLSL round() leaves halfway direction undefined).
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    const float v = float(data_a[idx]);
    // floor(v + 0.5) sends +x.5 up; ceil(v - 0.5) sends -x.5 down —
    // both directions are away from zero.
    const float rounded = (v >= 0.0) ? floor(v + 0.5) : ceil(v - 0.5);
    data_d[idx] = D_TYPE(rounded);
}
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise softplus: log(1 + exp(x)).
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    const float v = float(data_a[idx]);
    // For large inputs softplus(v) ~= v; short-circuiting above 20 also
    // avoids overflow in exp().
    float y;
    if (v > 20.0f) {
        y = v;
    } else {
        y = log(1.0f + exp(v));
    }
    data_d[idx] = D_TYPE(y);
}
#version 450
#include "types.glsl"
#include "generic_binary_head.glsl"
// Batched triangular solve: each workgroup solves A*X = B for one (i2,i3)
// batch by forward substitution. A is N x N (only the lower triangle and
// diagonal are read), B and X are N x K; one thread handles one column of X.
layout (constant_id = 1) const uint N = 64;
layout (constant_id = 2) const uint K = 32;
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
// Per-batch base offsets into A, B and the destination X.
uint a_base, b_base, x_base;
FLOAT_TYPE get_a(uint r, uint c) {
return FLOAT_TYPE(data_a[a_base + r * p.nb01 + c * p.nb00]);
}
FLOAT_TYPE get_b(uint r, uint c) {
return FLOAT_TYPE(data_b[b_base + r * p.nb11 + c * p.nb10]);
}
void store_x(uint r, uint c, FLOAT_TYPE v) {
data_d[x_base + r * p.nb21 + c * p.nb20] = D_TYPE(v);
}
// A and B are staged in shared memory so the substitution loop reads fast.
shared FLOAT_TYPE shA[N * N];
shared FLOAT_TYPE shB[N * K];
void main() {
const uint batch = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
if (batch >= p.ne02 * p.ne03) {
return;
}
const uint i3 = batch / p.ne22;
const uint i2 = batch % p.ne22;
a_base = get_aoffset() + i2 * p.nb02 + i3 * p.nb03;
b_base = get_boffset() + i2 * p.nb12 + i3 * p.nb13;
x_base = get_doffset() + i2 * p.nb22 + i3 * p.nb23;
// Load the A matrix into shA
[[unroll]] for (uint i = 0; i < N * N; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
// The modulo test is compile-time constant; the bounds check only
// survives when N*N doesn't divide evenly by the workgroup size.
if (((N * N) % gl_WorkGroupSize.x == 0) || idx < N * N) {
shA[idx] = get_a(idx / N, idx % N);
}
}
// Load the B matrix into shB
[[unroll]] for (uint i = 0; i < N * K; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((N * K) % gl_WorkGroupSize.x == 0) || idx < N * K) {
shB[idx] = get_b(idx / K, idx % K);
}
}
barrier();
FLOAT_TYPE X[N];
// Each thread solves one column
if (tid < K) {
// Forward substitution: rows are resolved top-down, each using the
// previously solved entries of this column.
[[unroll]] for (int r = 0; r < N; ++r) {
FLOAT_TYPE b = shB[r * K + tid];
// Compute x[r,c] = (b[r,c] - sum(a[r,c]*x[c])) / a[r,r]
[[unroll]] for (int c = 0; c < r; ++c) {
b -= shA[r * N + c] * X[c];
}
FLOAT_TYPE x = b / shA[r * N + r];
X[r] = x;
store_x(r, tid, x);
}
}
}
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise Heaviside step: 1.0 for x >= 0, 0.0 otherwise.
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    // GLSL step(edge, x) returns 0.0 when x < edge and 1.0 otherwise,
    // which is exactly the x >= 0 comparison.
    data_d[idx] = D_TYPE(step(0.0f, float(data_a[idx])));
}
#version 450 #version 450
#include "types.glsl" #include "types.glsl"
#include "sum_rows.glsl"
#extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_control_flow_attributes : enable
...@@ -11,30 +12,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; ...@@ -11,30 +12,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (push_constant) uniform parameter
{
uint n_cols;
uint ne01, ne02;
uint nb01, nb02, nb03;
uint nb11, nb12, nb13;
float weight;
uint misalign_offsets;
uint ne0_12mp, ne0_12L;
uint ne0_1mp, ne0_1L;
} p;
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
shared FLOAT_TYPE tmp[BLOCK_SIZE]; shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() { void main() {
......
// vk_op_sum_rows_push_constants
// Shared push-constant layout and helpers for the sum_rows-family shaders.
layout (push_constant) uniform parameter
{
uint n_cols;
uint ne01, ne02;
uint nb01, nb02, nb03;
uint nb11, nb12, nb13;
float weight; // scale applied to the row sum
uint misalign_offsets; // packed: src offset in the high 16 bits, dst in the low 16
uint ne0_12mp, ne0_12L; // fastdiv magic/shift for ne01*ne02
uint ne0_1mp, ne0_1L; // fastdiv magic/shift for ne01
} p;
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
// see init_fastdiv_values in ggml-vulkan.cpp
// Division by a runtime constant via multiply-high and shift.
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
#version 450
// Top-k via per-workgroup bitonic sort. Multi-pass: the first pass reads the
// raw source, intermediate passes read/write (index, valueBits) pairs, and
// the last pass writes only the selected indices.
#extension GL_EXT_control_flow_attributes : enable
#include "types.glsl"
layout(constant_id = 0) const int BLOCK_SIZE = 1024;
layout(constant_id = 1) const int NCOLS_PADDED_LOG2 = 10;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Input can either be the source (A) or intermediate values (S).
// Similarly, output can be either destination (D) or intermediate values (S).
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 0) readonly buffer S {ivec2 data_s[];};
layout (binding = 1) writeonly buffer D {int data_d[];};
layout (binding = 1) writeonly buffer T {ivec2 data_t[];};
layout (push_constant) uniform parameter {
uint orig_ncols;
uint ncols_input;
uint ncols_output;
uint nrows;
uint first_pass;
uint last_pass;
} p;
// pairs of (gid, value)
shared ivec2 dst_row[BLOCK_SIZE];
void topk(bool needs_bounds_check, const uint row) {
const int col = int(gl_LocalInvocationID.x);
// initialize indices
if (gl_GlobalInvocationID.x < p.ncols_input) {
if (p.first_pass != 0) {
// First pass: pack (column index, raw float bits) from the source.
const uint row_offset = row * p.ncols_input;
dst_row[col] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
} else {
const uint row_offset = row * p.orig_ncols;
dst_row[col] = data_s[row_offset + gl_GlobalInvocationID.x];
}
} else {
// Out-of-range slots get a sentinel index (orig_ncols) so they sort last.
dst_row[col] = ivec2(p.orig_ncols, 0);
}
barrier();
if (p.ncols_output == 1) {
// Fast path for single output - just do a max reduction
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
if (col < s) {
ivec2 a = dst_row[col];
ivec2 b = dst_row[col + s];
if (a.x >= p.orig_ncols ||
b.x < p.orig_ncols && b.y > a.y) {
dst_row[col] = b;
}
}
barrier();
}
} else {
// bitonic sort on this group of elements
// Descending order so the top ncols_output values end up at the front.
uint num_outer_loop_iters = NCOLS_PADDED_LOG2;
for (uint k = 2, outer_idx = 0; outer_idx < num_outer_loop_iters; k *= 2, outer_idx++) {
uint num_inner_loop_iters = outer_idx + 1;
for (uint j = k / 2, inner_idx = 0; inner_idx < num_inner_loop_iters; j /= 2, inner_idx++) {
const int ixj = int(col ^ j);
int idx_0 = (col & k) == 0 ? col : ixj;
int idx_1 = (col & k) == 0 ? ixj : col;
ivec2 sh_idx_0 = dst_row[idx_0];
ivec2 sh_idx_1 = dst_row[idx_1];
// Out-of-bounds sentinels always lose the comparison.
bool idx_0_oob = needs_bounds_check ? sh_idx_0.x >= p.orig_ncols : false;
bool idx_1_oob = needs_bounds_check ? sh_idx_1.x >= p.orig_ncols : false;
if ((idx_0_oob ||
(!idx_1_oob && intBitsToFloat(sh_idx_0.y) < intBitsToFloat(sh_idx_1.y))) && (ixj > col)) {
dst_row[idx_0] = sh_idx_1;
dst_row[idx_1] = sh_idx_0;
}
barrier();
}
}
}
if (col < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
if (p.last_pass != 0) {
// Final pass emits only the winning column indices.
const uint row_offset = row * p.ncols_output;
data_d[row_offset + col] = dst_row[col].x;
} else {
// Intermediate pass: keep (index, value) pairs for the next round.
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
data_t[row_offset + col] = dst_row[col];
}
}
}
void main() {
// Fast path for fully occupied workgroups
if ((p.ncols_input % BLOCK_SIZE) == 0) {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
topk(false, row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
} else {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
topk(true, row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
}
}
#version 450
// Top-k via N-ary range search over the value bits plus subgroup compaction,
// an alternative to the bitonic-sort variant that avoids a full sort.
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_debug_printf : enable
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_shuffle : enable
#include "types.glsl"
layout(constant_id = 0) const int BLOCK_SIZE = 1024;
layout(constant_id = 1) const int SUBGROUP_SIZE = 32;
layout(constant_id = 2) const int SUBGROUP_SIZE_LOG2 = 5;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Input can either be the source (A) or intermediate values (S).
// Similarly, output can be either destination (D) or intermediate values (S).
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 0) readonly buffer S {ivec2 data_s[];};
layout (binding = 1) writeonly buffer D {int data_d[];};
layout (binding = 1) writeonly buffer T {ivec2 data_t[];};
layout (push_constant) uniform parameter {
uint orig_ncols;
uint ncols_input;
uint ncols_output;
uint nrows;
uint first_pass;
uint last_pass;
} p;
// pairs of (gid, value)
shared ivec2 dst_row[BLOCK_SIZE];
// Per-bucket element counts for the current search step.
shared int counts[SUBGROUP_SIZE];
shared int sh_min_idx;
shared uint sh_total;
shared uint offset_partials[BLOCK_SIZE / SUBGROUP_SIZE];
// Map float values to uint such that comparisons still work.
// Positive values set the high bit, negative values are inverted.
// +0.0 -> 0x80000000, -0.0 -> 0x7FFFFFFF are in the correct places.
uint f2ui(float x) {
uint y = floatBitsToUint(x);
if ((y & 0x80000000) != 0) {
y ^= ~0;
} else {
y |= 0x80000000;
}
return y;
}
void topk(const uint row) {
const int tid = int(gl_LocalInvocationID.x);
// initialize indices
if (gl_GlobalInvocationID.x < p.ncols_input) {
if (p.first_pass != 0) {
const uint row_offset = row * p.ncols_input;
dst_row[tid] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
} else {
const uint row_offset = row * p.orig_ncols;
dst_row[tid] = data_s[row_offset + gl_GlobalInvocationID.x];
}
} else {
dst_row[tid] = ivec2(p.orig_ncols, 0xFF800000); // -inf
}
barrier();
if (p.ncols_output == 1) {
// Fast path for single output - just do a max reduction
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
if (tid < s) {
ivec2 a = dst_row[tid];
ivec2 b = dst_row[tid + s];
if (a.x >= p.orig_ncols ||
b.x < p.orig_ncols && b.y > a.y) {
dst_row[tid] = b;
}
}
barrier();
}
} else {
// Do an N-ary search to find the K-th largest value.
// We remap the float values to be comparable as unsigned integers,
// and split the range into 2^N smaller ranges where N is the
// subgroup size. Count how many values are in each range, if the K-th
// largest value is in the middle of one of these ranges then repeat
// and split again.
// Mask is the current set of bits we're searching. Shift is the LSB index.
int shift = 32 - SUBGROUP_SIZE_LOG2;
uint mask = ((1 << SUBGROUP_SIZE_LOG2) - 1) << shift;
// The current range.
uint range_min = 0;
uint range_max = 0xFF800000;
// How many are above the current range, and how many we need to find.
uint total = 0;
uint limit = min(p.ncols_output, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
while (mask != 0) {
barrier();
// Initialize bucket counts to zero.
if (tid < SUBGROUP_SIZE) {
counts[tid] = 0;
}
barrier();
// Count how many values are in each bucket.
if (tid < p.ncols_input) {
float y = intBitsToFloat(dst_row[tid].y);
uint fy = f2ui(y);
if (fy >= range_min && fy < range_max) {
uint bucket = (fy & mask) >> shift;
atomicAdd(counts[bucket], 1);
}
}
barrier();
// On the first subgroup, do a scan to count (from the top down) how
// many elements are in the top N buckets. Find the index of the first
// that is over the limit. Copy it to the other invocations through
// shared memory.
if (tid < SUBGROUP_SIZE) {
uint partial_sum = counts[SUBGROUP_SIZE - 1 - tid];
partial_sum = subgroupInclusiveAdd(partial_sum) + total;
uint t = subgroupBallotFindLSB(subgroupBallot(partial_sum >= limit));
if (tid == t) {
sh_min_idx = int(SUBGROUP_SIZE - 1 - t);
sh_total = partial_sum;
}
}
barrier();
int min_idx = sh_min_idx;
total = sh_total;
// Update the range, and break if we've found the K-th largest.
range_max = range_min + ((min_idx + 1) << shift);
range_min = range_min + (min_idx << shift);
if (total == p.ncols_output) {
break;
}
// Descend into the boundary bucket and split it with the next
// lower group of bits.
total -= counts[min_idx];
mask >>= SUBGROUP_SIZE_LOG2;
shift -= SUBGROUP_SIZE_LOG2;
if (shift < 0) {
shift = 0;
}
}
ivec2 v = dst_row[tid];
// We need to compact these values to the start of the dst_row array.
// Have each subgroup count how many items it'll store, so other
// subgroups can compute their base offset.
bool top = f2ui(intBitsToFloat(v.y)) >= range_min;
uvec4 b = subgroupBallot(top);
uint bit_count = subgroupBallotBitCount(b);
if ((tid % SUBGROUP_SIZE) == 0) {
offset_partials[tid / SUBGROUP_SIZE] = bit_count;
}
barrier();
// Prefix-sum the per-subgroup counts to get this subgroup's base offset.
uint out_idx = 0;
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
if (i < tid / SUBGROUP_SIZE) {
out_idx += offset_partials[i];
}
}
uint bit_count_ex = subgroupBallotExclusiveBitCount(b);
if (top) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex] = v;
}
barrier();
}
if (tid < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
if (p.last_pass != 0) {
const uint row_offset = row * p.ncols_output;
data_d[row_offset + tid] = dst_row[tid].x;
} else {
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
data_t[row_offset + tid] = dst_row[tid];
}
}
}
void main() {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
topk(row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
}
#version 450
// Triangle-mask shader: copies elements in the selected triangle of each
// matrix and zeroes the rest. The mode is carried in p.param1 (bit-cast int).
#include "rte.glsl"
#include "types.glsl"
#include "generic_unary_head.glsl"
#define GGML_TRI_TYPE_UPPER_DIAG 0
#define GGML_TRI_TYPE_UPPER 1
#define GGML_TRI_TYPE_LOWER_DIAG 2
#define GGML_TRI_TYPE_LOWER 3
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
// Decompose the flat index into (i03, i02, i01, i00) using fastdiv; only
// the row (i01) and column (i00) are needed for the triangle test.
const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
const uint i02_offset = i02*p.ne01*p.ne00;
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
int param = floatBitsToInt(p.param1);
bool pass = false;
// "DIAG" variants include the main diagonal, the others exclude it.
switch (param) {
case GGML_TRI_TYPE_UPPER_DIAG: pass = i00 >= i01; break;
case GGML_TRI_TYPE_UPPER: pass = i00 > i01; break;
case GGML_TRI_TYPE_LOWER_DIAG: pass = i00 <= i01; break;
case GGML_TRI_TYPE_LOWER: pass = i00 < i01; break;
}
if (pass) {
const float val = float(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val);
} else {
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0);
}
}
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise truncation: drop the fractional part (round toward zero).
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    data_d[idx] = D_TYPE(trunc(float(data_a[idx])));
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment