Unverified Commit 0cf7794b authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

ggml update to b7108 (#12992)

* Revert "vulkan: temporary cary of vulkan fixes (#12971)"

This reverts commit 3a9e8e9f.

* ggml update to b7087

* fix argsort on metal

* update to b7108

* fix bakllava regression

This model lacks the metadata for the projector type.

* update to b7209

* fix TopK perf

* only build arm code on arm
parent 854d40ed
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise negation: data_d[idx] = -data_a[idx] for idx < p.KX.
void main() {
    // Flatten the 3D dispatch into one linear element index
    // (512 threads per workgroup along x, 512 groups per y slice).
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx < p.KX) {
        data_d[idx] = D_TYPE(-float(data_a[idx]));
    }
}
...@@ -61,7 +61,7 @@ void quantize() { ...@@ -61,7 +61,7 @@ void quantize() {
const uint a_idx = ib * 8 + iqs; const uint a_idx = ib * 8 + iqs;
vec4 vals = a_idx < p.ne ? data_a[a_idx] : vec4(0.0f); vec4 vals = a_idx < p.ne / 4 ? data_a[a_idx] : vec4(0.0f);
const vec4 abs_vals = abs(vals); const vec4 abs_vals = abs(vals);
// Find absolute max for each block // Find absolute max for each block
......
...@@ -3,6 +3,32 @@ ...@@ -3,6 +3,32 @@
#include "generic_binary_head.glsl" #include "generic_binary_head.glsl"
#include "types.glsl" #include "types.glsl"
#if RMS_NORM_ROPE_FUSION
// Fused RMS_NORM + ROPE variant: bindings 0/1 are the rms_norm inputs,
// bindings 3..6 are the rope inputs/outputs.
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
// data is passed from rms_norm -> rope through shared memory.
// rms_norm calls this data_d, rope calls this rope_data_a.
// Binding 2 is not used
shared FLOAT_TYPE rope_data_a[1024];
#define data_d rope_data_a
layout (binding = 3) readonly buffer R_Y {int rope_data_pos[];};
layout (binding = 4) readonly buffer R_Z {float rope_data_ff[];};
layout (binding = 5) writeonly buffer R_D {ROPE_D_TYPE rope_data_d[];};
layout (binding = 6) readonly buffer R_I {uvec2 rope_data_i[];}; // indices for set_rows
#include "rope_params.glsl"
#include "rope_funcs.glsl"
// Rope mode selectors compared against rope_params.rope_mode; names mirror
// ggml's GGML_ROPE_TYPE_* constants.
#define GGML_ROPE_TYPE_NORMAL 0
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#endif
#extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
...@@ -28,8 +54,12 @@ void rms_norm(uint num_iters) { ...@@ -28,8 +54,12 @@ void rms_norm(uint num_iters) {
uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset();
uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset();
#if RMS_NORM_ROPE_FUSION
// Per-row offset in shared memory
uint32_t d_offset = 0;
#else
uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();
#endif
FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp
[[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) {
...@@ -79,6 +109,18 @@ void rms_norm(uint num_iters) { ...@@ -79,6 +109,18 @@ void rms_norm(uint num_iters) {
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
} }
} }
#if RMS_NORM_ROPE_FUSION
barrier();
rope_params rp = p.rope;
uint rope_row = (samp*nchannels + channel)*nrows + row;
for (uint t = 2*tid; t < ncols; t += 2*BLOCK_SIZE) {
if (rp.rope_mode == GGML_ROPE_TYPE_NEOX) {
rope_neox(t, rope_row, rp);
} else if (rp.rope_mode == GGML_ROPE_TYPE_NORMAL) {
rope_norm(t, rope_row, rp);
}
}
#endif
} }
void main() { void main() {
......
// YaRN ramp weight for dimension index i0: 1 below `low`, 0 above `high`,
// linear in between (the divisor is floored at 0.001 to avoid div-by-zero).
float rope_yarn_ramp(const float low, const float high, const uint i0) {
    const float t = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - clamp(t, 0.0f, 1.0f);
}
// Source index for rope element i0 of row i01, channel i02. In the fused
// rms_norm+rope build the source is the per-row shared-memory staging buffer,
// so only the column offset is needed; otherwise index the global buffer.
uint rope_a_coord(const uint i0, const uint i01, const uint i02, rope_params p) {
#if RMS_NORM_ROPE_FUSION
// Per-row offset in shared memory
const uint ix = i0;
#else
// Standalone rope: apply the channel (nb02) and row (nb01) strides.
const uint ix = i02*p.nb02 + i01*p.nb01 + i0;
#endif
return ix;
}
// YaRN-corrected rotation angle: blends the interpolated (freq-scaled) and
// extrapolated theta using the ramp for this dimension, then emits the
// magnitude-scaled cos/sin used for the 2D rotation.
void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta, rope_params p) {
float mscale = p.attn_factor;
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = p.freq_scale * theta_extrap;
float theta = theta_interp;
if (p.ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
}
// Backpropagation uses inverted rotation
if (p.is_back != 0) {
theta = -theta;
}
cos_theta = cos(theta) * mscale;
sin_theta = sin(theta) * mscale;
}
// "Normal" rope: rotates adjacent element pairs (i0, i0+1) of row i1.
// i0 is the even element index within the row; i1 is the flattened row index.
void rope_norm(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
if (i0 >= ne0) {
return;
}
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0;
const uint ix = rope_a_coord(i0, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
// Dimensions past n_dims are passed through unrotated.
if (i0 >= p.n_dims) {
rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]);
rope_data_d[idst + 1] = ROPE_D_TYPE(rope_data_a[ix + 1]);
return;
}
const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
// Standard 2D rotation of the (x0, x1) pair by theta.
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + 1]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + 1] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
// NEOX-style rope: rotates element pairs split across halves of the head,
// (i0/2, i0/2 + n_dims/2), instead of adjacent elements.
void rope_neox(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
if (i0 >= ne0) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
// Dimensions past n_dims are passed through unrotated.
if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);
return;
}
const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
// Rotate the pair taken from the two halves of the rotated dimensions.
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + p.n_dims/2]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
// Multi-section rope (M-RoPE): the rotated dimensions are partitioned into up
// to four sections, each taking its position from a different slice of
// rope_data_pos (offset by multiples of ne2). is_imrope selects an
// interleaved (mod-3) section assignment instead of contiguous ranges.
void rope_multi(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
uint ne2 = p.ne02;
if (i0 >= ne0) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Dimensions past n_dims are passed through unrotated.
if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);
return;
}
const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (p.is_imrope != 0) {
// Interleaved assignment: sector index mod 3 picks the position slice.
if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
//} else {
//    theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
}
} else {
// Contiguous assignment: sector falls into one of four ranges.
if (sector < p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= p.sections[0] && sector < sec_w) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= sec_w + p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
}
}
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
// NEOX-style pair layout: halves of the rotated dimensions.
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + p.n_dims/2]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
// Vision rope: two sections whose angle exponent is the position *within* the
// section (p0), not i0/2; the rotated pair is (i0/2, i0/2 + n_dims).
void rope_vision(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
uint ne2 = p.ne02;
if (i0 >= ne0) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
const int sect_dims = p.sections[0] + p.sections[1];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < p.sections[0]) {
// First section uses the first position slice.
const uint p0 = sector;
theta_base = rope_data_pos[i02]*pow(p.theta_scale, p0);
}
else if (sector >= p.sections[0] && sector < sec_w) {
// Second section uses the position slice offset by ne2.
const uint p0 = sector - p.sections[0];
theta_base = rope_data_pos[i02 + ne2]*pow(p.theta_scale, p0);
}
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p);
const float x0 = float(rope_data_a[ix + 0]);
const float x1 = float(rope_data_a[ix + p.n_dims]);
rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta);
rope_data_d[idst + p.n_dims] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
...@@ -3,55 +3,18 @@ ...@@ -3,55 +3,18 @@
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
#include "rte.glsl" #include "rte.glsl"
#include "rope_params.glsl"
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 0) readonly buffer X {A_TYPE rope_data_a[];};
layout (binding = 1) readonly buffer Y {int data_pos[];}; layout (binding = 1) readonly buffer Y {int rope_data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];}; layout (binding = 2) readonly buffer Z {float rope_data_ff[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];}; layout (binding = 3) writeonly buffer D {ROPE_D_TYPE rope_data_d[];};
layout (binding = 4) readonly buffer I {uvec2 data_i[];}; // indices for set_rows layout (binding = 4) readonly buffer I {uvec2 rope_data_i[];}; // indices for set_rows
layout (push_constant) uniform parameter {
uint ncols;
uint n_dims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float theta_scale;
uint has_ff;
uint ne02;
uint s1;
uint s2;
int sections[4];
uint is_back;
uint set_rows_stride;
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
}
void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) { layout (push_constant) uniform parameter {
float mscale = p.attn_factor; rope_params pc;
// Get n-d rotational scaling corrected for extrapolation };
float theta_interp = p.freq_scale * theta_extrap;
float theta = theta_interp;
if (p.ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
}
// Backprogagation uses inverted rotation
if (p.is_back != 0) {
theta = -theta;
}
cos_theta = cos(theta) * mscale;
sin_theta = sin(theta) * mscale;
}
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
uint ne2 = p.ne02; rope_multi(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
const uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
if (i0 >= p.n_dims) {
data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
return;
}
const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
}
else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
}
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + p.n_dims/2]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
rope_neox(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = row_x*ne0 + i0/2;
idst += data_i[channel_x].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
data_d[idst + i0/2 + 0] = D_TYPE(data_a[ix + i0/2 + 0]);
data_d[idst + i0/2 + 1] = D_TYPE(data_a[ix + i0/2 + 1]);
return;
}
const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + p.n_dims/2]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
rope_norm(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
uint idst = row_dst*ne0 + i0;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
if (p.set_rows_stride != 0) {
idst = row_x*ne0 + i0;
idst += data_i[channel_x].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
data_d[idst + 0] = D_TYPE(data_a[ix + 0]);
data_d[idst + 1] = D_TYPE(data_a[ix + 1]);
return;
}
const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + 1]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + 1] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#if !defined(GGML_ROPE_PARAMS)
#define GGML_ROPE_PARAMS
#include "rte.glsl"
// Parameter block shared by all rope shader variants (standalone push
// constants and the fused rms_norm+rope path).
struct rope_params {
uint rope_mode; // GGML_ROPE_TYPE_* selector
uint ncols; // row width (ne0)
uint n_dims; // number of rotated dimensions; the rest pass through
float freq_scale; // theta interpolation scale
uint p_delta_rows; // rows per channel (ne1); used to split the flat row index
float freq_base;
float ext_factor; // YaRN extrapolation mix factor; 0 disables the ramp blend
float attn_factor; // base magnitude scale applied to cos/sin
float corr_dims[2]; // YaRN ramp bounds [low, high]
float theta_scale; // per-dimension frequency decay base
uint has_ff; // nonzero if the frequency-factor buffer is bound
uint ne02; // channels; stride between position slices in rope_data_pos
uint nb01; // row stride of the source (unused in the fused path)
uint nb02; // channel stride of the source (unused in the fused path)
int sections[4]; // section widths for mrope/vision modes
uint is_imrope; // nonzero: interleaved (mod-3) section assignment
uint is_back; // nonzero: backward pass, rotation angle is negated
uint set_rows_stride; // nonzero enables the fused ROPE+VIEW+SET_ROWS output path
};
#endif // !defined(GGML_ROPE_PARAMS)
#version 450 #version 450
#include "rope_head.glsl" #include "rope_head.glsl"
#include "rope_funcs.glsl"
void main() { void main() {
const uint i0 = 2*gl_GlobalInvocationID.y; const uint i0 = 2*gl_GlobalInvocationID.y;
uint ne0 = p.ncols; // i1 is actually i2*nb2+i1, but the rows are contiguous
uint ne1 = p.p_delta_rows; const uint i1 = gl_GlobalInvocationID.x;
uint ne2 = p.ne02; rope_vision(i0, i1, pc);
if (i0 >= ne0) {
return;
}
const uint row_dst = gl_GlobalInvocationID.x;
const uint row_x = row_dst % ne1;
const uint channel_x = row_dst / ne1;
const uint idst = row_dst*ne0 + i0/2;
const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
const int sect_dims = p.sections[0] + p.sections[1];
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < p.sections[0]) {
const uint p0 = sector;
theta_base = data_pos[channel_x]*pow(p.theta_scale, p0);
}
else if (sector >= p.sections[0] && sector < sec_w) {
const uint p0 = sector - p.sections[0];
theta_base = data_pos[channel_x + ne2]*pow(p.theta_scale, p0);
}
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta);
const float x0 = float(data_a[ix + 0]);
const float x1 = float(data_a[ix + p.n_dims]);
data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[idst + p.n_dims] = D_TYPE(x0*sin_theta + x1*cos_theta);
} }
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise round-to-nearest with halfway cases away from zero,
// matching C's roundf (GLSL round() leaves halfway direction undefined).
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    const float v = float(data_a[idx]);
    // floor(v + 0.5) sends +x.5 up; ceil(v - 0.5) sends -x.5 down —
    // both directions are away from zero.
    const float rounded = (v >= 0.0) ? floor(v + 0.5) : ceil(v - 0.5);
    data_d[idx] = D_TYPE(rounded);
}
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise softplus: log(1 + exp(x)).
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    const float v = float(data_a[idx]);
    // For large inputs softplus(v) ~= v; short-circuiting above 20 also
    // avoids overflow in exp().
    float y;
    if (v > 20.0f) {
        y = v;
    } else {
        y = log(1.0f + exp(v));
    }
    data_d[idx] = D_TYPE(y);
}
#version 450
#include "types.glsl"
#include "generic_binary_head.glsl"
// Batched triangular solve: each workgroup solves A*X = B for one (i2,i3)
// batch by forward substitution. A is N x N (only the lower triangle and
// diagonal are read), B and X are N x K; one thread handles one column of X.
layout (constant_id = 1) const uint N = 64;
layout (constant_id = 2) const uint K = 32;
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
// Per-batch base offsets into A, B and the destination X.
uint a_base, b_base, x_base;
FLOAT_TYPE get_a(uint r, uint c) {
return FLOAT_TYPE(data_a[a_base + r * p.nb01 + c * p.nb00]);
}
FLOAT_TYPE get_b(uint r, uint c) {
return FLOAT_TYPE(data_b[b_base + r * p.nb11 + c * p.nb10]);
}
void store_x(uint r, uint c, FLOAT_TYPE v) {
data_d[x_base + r * p.nb21 + c * p.nb20] = D_TYPE(v);
}
// A and B are staged in shared memory so the substitution loop reads fast.
shared FLOAT_TYPE shA[N * N];
shared FLOAT_TYPE shB[N * K];
void main() {
const uint batch = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
if (batch >= p.ne02 * p.ne03) {
return;
}
const uint i3 = batch / p.ne22;
const uint i2 = batch % p.ne22;
a_base = get_aoffset() + i2 * p.nb02 + i3 * p.nb03;
b_base = get_boffset() + i2 * p.nb12 + i3 * p.nb13;
x_base = get_doffset() + i2 * p.nb22 + i3 * p.nb23;
// Load the A matrix into shA
[[unroll]] for (uint i = 0; i < N * N; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
// The modulo test is compile-time constant; the bounds check only
// survives when N*N doesn't divide evenly by the workgroup size.
if (((N * N) % gl_WorkGroupSize.x == 0) || idx < N * N) {
shA[idx] = get_a(idx / N, idx % N);
}
}
// Load the B matrix into shB
[[unroll]] for (uint i = 0; i < N * K; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((N * K) % gl_WorkGroupSize.x == 0) || idx < N * K) {
shB[idx] = get_b(idx / K, idx % K);
}
}
barrier();
FLOAT_TYPE X[N];
// Each thread solves one column
if (tid < K) {
// Forward substitution: rows are resolved top-down, each using the
// previously solved entries of this column.
[[unroll]] for (int r = 0; r < N; ++r) {
FLOAT_TYPE b = shB[r * K + tid];
// Compute x[r,c] = (b[r,c] - sum(a[r,c]*x[c])) / a[r,r]
[[unroll]] for (int c = 0; c < r; ++c) {
b -= shA[r * N + c] * X[c];
}
FLOAT_TYPE x = b / shA[r * N + r];
X[r] = x;
store_x(r, tid, x);
}
}
}
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise Heaviside step: 1.0 for x >= 0, 0.0 otherwise.
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    // GLSL step(edge, x) returns 0.0 when x < edge and 1.0 otherwise,
    // which is exactly the x >= 0 comparison.
    data_d[idx] = D_TYPE(step(0.0f, float(data_a[idx])));
}
#version 450 #version 450
#include "types.glsl" #include "types.glsl"
#include "sum_rows.glsl"
#extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_control_flow_attributes : enable
...@@ -11,30 +12,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; ...@@ -11,30 +12,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (push_constant) uniform parameter
{
uint n_cols;
uint ne01, ne02;
uint nb01, nb02, nb03;
uint nb11, nb12, nb13;
float weight;
uint misalign_offsets;
uint ne0_12mp, ne0_12L;
uint ne0_1mp, ne0_1L;
} p;
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
shared FLOAT_TYPE tmp[BLOCK_SIZE]; shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() { void main() {
......
// vk_op_sum_rows_push_constants
// Shared push-constant layout and helpers for the sum_rows-family shaders.
layout (push_constant) uniform parameter
{
uint n_cols;
uint ne01, ne02;
uint nb01, nb02, nb03;
uint nb11, nb12, nb13;
float weight; // scale applied to the row sum
uint misalign_offsets; // packed: src offset in the high 16 bits, dst in the low 16
uint ne0_12mp, ne0_12L; // fastdiv magic/shift for ne01*ne02
uint ne0_1mp, ne0_1L; // fastdiv magic/shift for ne01
} p;
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
// see init_fastdiv_values in ggml-vulkan.cpp
// Division by a runtime constant via multiply-high and shift.
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
#version 450
// Top-k via per-workgroup bitonic sort. Multi-pass: the first pass reads the
// raw source, intermediate passes read/write (index, valueBits) pairs, and
// the last pass writes only the selected indices.
#extension GL_EXT_control_flow_attributes : enable
#include "types.glsl"
layout(constant_id = 0) const int BLOCK_SIZE = 1024;
layout(constant_id = 1) const int NCOLS_PADDED_LOG2 = 10;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Input can either be the source (A) or intermediate values (S).
// Similarly, output can be either destination (D) or intermediate values (S).
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 0) readonly buffer S {ivec2 data_s[];};
layout (binding = 1) writeonly buffer D {int data_d[];};
layout (binding = 1) writeonly buffer T {ivec2 data_t[];};
layout (push_constant) uniform parameter {
uint orig_ncols;
uint ncols_input;
uint ncols_output;
uint nrows;
uint first_pass;
uint last_pass;
} p;
// pairs of (gid, value)
shared ivec2 dst_row[BLOCK_SIZE];
void topk(bool needs_bounds_check, const uint row) {
const int col = int(gl_LocalInvocationID.x);
// initialize indices
if (gl_GlobalInvocationID.x < p.ncols_input) {
if (p.first_pass != 0) {
// First pass: pack (column index, raw float bits) from the source.
const uint row_offset = row * p.ncols_input;
dst_row[col] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
} else {
const uint row_offset = row * p.orig_ncols;
dst_row[col] = data_s[row_offset + gl_GlobalInvocationID.x];
}
} else {
// Out-of-range slots get a sentinel index (orig_ncols) so they sort last.
dst_row[col] = ivec2(p.orig_ncols, 0);
}
barrier();
if (p.ncols_output == 1) {
// Fast path for single output - just do a max reduction
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
if (col < s) {
ivec2 a = dst_row[col];
ivec2 b = dst_row[col + s];
if (a.x >= p.orig_ncols ||
b.x < p.orig_ncols && b.y > a.y) {
dst_row[col] = b;
}
}
barrier();
}
} else {
// bitonic sort on this group of elements
// Descending order so the top ncols_output values end up at the front.
uint num_outer_loop_iters = NCOLS_PADDED_LOG2;
for (uint k = 2, outer_idx = 0; outer_idx < num_outer_loop_iters; k *= 2, outer_idx++) {
uint num_inner_loop_iters = outer_idx + 1;
for (uint j = k / 2, inner_idx = 0; inner_idx < num_inner_loop_iters; j /= 2, inner_idx++) {
const int ixj = int(col ^ j);
int idx_0 = (col & k) == 0 ? col : ixj;
int idx_1 = (col & k) == 0 ? ixj : col;
ivec2 sh_idx_0 = dst_row[idx_0];
ivec2 sh_idx_1 = dst_row[idx_1];
// Out-of-bounds sentinels always lose the comparison.
bool idx_0_oob = needs_bounds_check ? sh_idx_0.x >= p.orig_ncols : false;
bool idx_1_oob = needs_bounds_check ? sh_idx_1.x >= p.orig_ncols : false;
if ((idx_0_oob ||
(!idx_1_oob && intBitsToFloat(sh_idx_0.y) < intBitsToFloat(sh_idx_1.y))) && (ixj > col)) {
dst_row[idx_0] = sh_idx_1;
dst_row[idx_1] = sh_idx_0;
}
barrier();
}
}
}
if (col < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
if (p.last_pass != 0) {
// Final pass emits only the winning column indices.
const uint row_offset = row * p.ncols_output;
data_d[row_offset + col] = dst_row[col].x;
} else {
// Intermediate pass: keep (index, value) pairs for the next round.
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
data_t[row_offset + col] = dst_row[col];
}
}
}
void main() {
// Fast path for fully occupied workgroups
if ((p.ncols_input % BLOCK_SIZE) == 0) {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
topk(false, row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
} else {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
topk(true, row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
}
}
#version 450
// Top-k via N-ary range search over the value bits plus subgroup compaction,
// an alternative to the bitonic-sort variant that avoids a full sort.
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_debug_printf : enable
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_shuffle : enable
#include "types.glsl"
layout(constant_id = 0) const int BLOCK_SIZE = 1024;
layout(constant_id = 1) const int SUBGROUP_SIZE = 32;
layout(constant_id = 2) const int SUBGROUP_SIZE_LOG2 = 5;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Input can either be the source (A) or intermediate values (S).
// Similarly, output can be either destination (D) or intermediate values (S).
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 0) readonly buffer S {ivec2 data_s[];};
layout (binding = 1) writeonly buffer D {int data_d[];};
layout (binding = 1) writeonly buffer T {ivec2 data_t[];};
layout (push_constant) uniform parameter {
uint orig_ncols;
uint ncols_input;
uint ncols_output;
uint nrows;
uint first_pass;
uint last_pass;
} p;
// pairs of (gid, value)
shared ivec2 dst_row[BLOCK_SIZE];
// Per-bucket element counts for the current search step.
shared int counts[SUBGROUP_SIZE];
shared int sh_min_idx;
shared uint sh_total;
shared uint offset_partials[BLOCK_SIZE / SUBGROUP_SIZE];
// Map float values to uint such that comparisons still work.
// Positive values set the high bit, negative values are inverted.
// +0.0 -> 0x80000000, -0.0 -> 0x7FFFFFFF are in the correct places.
uint f2ui(float x) {
uint y = floatBitsToUint(x);
if ((y & 0x80000000) != 0) {
y ^= ~0;
} else {
y |= 0x80000000;
}
return y;
}
void topk(const uint row) {
const int tid = int(gl_LocalInvocationID.x);
// initialize indices
if (gl_GlobalInvocationID.x < p.ncols_input) {
if (p.first_pass != 0) {
const uint row_offset = row * p.ncols_input;
dst_row[tid] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
} else {
const uint row_offset = row * p.orig_ncols;
dst_row[tid] = data_s[row_offset + gl_GlobalInvocationID.x];
}
} else {
dst_row[tid] = ivec2(p.orig_ncols, 0xFF800000); // -inf
}
barrier();
if (p.ncols_output == 1) {
// Fast path for single output - just do a max reduction
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
if (tid < s) {
ivec2 a = dst_row[tid];
ivec2 b = dst_row[tid + s];
if (a.x >= p.orig_ncols ||
b.x < p.orig_ncols && b.y > a.y) {
dst_row[tid] = b;
}
}
barrier();
}
} else {
// Do an N-ary search to find the K-th largest value.
// We remap the float values to be comparable as unsigned integers,
// and split the range into 2^N smaller ranges where N is the
// subgroup size. Count how many values are in each range, if the K-th
// largest value is in the middle of one of these ranges then repeat
// and split again.
// Mask is the current set of bits we're searching. Shift is the LSB index.
int shift = 32 - SUBGROUP_SIZE_LOG2;
uint mask = ((1 << SUBGROUP_SIZE_LOG2) - 1) << shift;
// The current range.
uint range_min = 0;
uint range_max = 0xFF800000;
// How many are above the current range, and how many we need to find.
uint total = 0;
uint limit = min(p.ncols_output, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
while (mask != 0) {
barrier();
// Initialize bucket counts to zero.
if (tid < SUBGROUP_SIZE) {
counts[tid] = 0;
}
barrier();
// Count how many values are in each bucket.
if (tid < p.ncols_input) {
float y = intBitsToFloat(dst_row[tid].y);
uint fy = f2ui(y);
if (fy >= range_min && fy < range_max) {
uint bucket = (fy & mask) >> shift;
atomicAdd(counts[bucket], 1);
}
}
barrier();
// On the first subgroup, do a scan to count (from the top down) how
// many elements are in the top N buckets. Find the index of the first
// that is over the limit. Copy it to the other invocations through
// shared memory.
if (tid < SUBGROUP_SIZE) {
uint partial_sum = counts[SUBGROUP_SIZE - 1 - tid];
partial_sum = subgroupInclusiveAdd(partial_sum) + total;
uint t = subgroupBallotFindLSB(subgroupBallot(partial_sum >= limit));
if (tid == t) {
sh_min_idx = int(SUBGROUP_SIZE - 1 - t);
sh_total = partial_sum;
}
}
barrier();
int min_idx = sh_min_idx;
total = sh_total;
// Update the range, and break if we've found the K-th largest.
range_max = range_min + ((min_idx + 1) << shift);
range_min = range_min + (min_idx << shift);
if (total == p.ncols_output) {
break;
}
// Descend into the boundary bucket and split it with the next
// lower group of bits.
total -= counts[min_idx];
mask >>= SUBGROUP_SIZE_LOG2;
shift -= SUBGROUP_SIZE_LOG2;
if (shift < 0) {
shift = 0;
}
}
ivec2 v = dst_row[tid];
// We need to compact these values to the start of the dst_row array.
// Have each subgroup count how many items it'll store, so other
// subgroups can compute their base offset.
bool top = f2ui(intBitsToFloat(v.y)) >= range_min;
uvec4 b = subgroupBallot(top);
uint bit_count = subgroupBallotBitCount(b);
if ((tid % SUBGROUP_SIZE) == 0) {
offset_partials[tid / SUBGROUP_SIZE] = bit_count;
}
barrier();
// Prefix-sum the per-subgroup counts to get this subgroup's base offset.
uint out_idx = 0;
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
if (i < tid / SUBGROUP_SIZE) {
out_idx += offset_partials[i];
}
}
uint bit_count_ex = subgroupBallotExclusiveBitCount(b);
if (top) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex] = v;
}
barrier();
}
if (tid < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
if (p.last_pass != 0) {
const uint row_offset = row * p.ncols_output;
data_d[row_offset + tid] = dst_row[tid].x;
} else {
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
data_t[row_offset + tid] = dst_row[tid];
}
}
}
void main() {
uint row = gl_WorkGroupID.y;
while (row < p.nrows) {
topk(row);
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
}
}
#version 450
// Triangle-mask shader: copies elements in the selected triangle of each
// matrix and zeroes the rest. The mode is carried in p.param1 (bit-cast int).
#include "rte.glsl"
#include "types.glsl"
#include "generic_unary_head.glsl"
#define GGML_TRI_TYPE_UPPER_DIAG 0
#define GGML_TRI_TYPE_UPPER 1
#define GGML_TRI_TYPE_LOWER_DIAG 2
#define GGML_TRI_TYPE_LOWER 3
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
// Decompose the flat index into (i03, i02, i01, i00) using fastdiv; only
// the row (i01) and column (i00) are needed for the triangle test.
const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
const uint i02_offset = i02*p.ne01*p.ne00;
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
int param = floatBitsToInt(p.param1);
bool pass = false;
// "DIAG" variants include the main diagonal, the others exclude it.
switch (param) {
case GGML_TRI_TYPE_UPPER_DIAG: pass = i00 >= i01; break;
case GGML_TRI_TYPE_UPPER: pass = i00 > i01; break;
case GGML_TRI_TYPE_LOWER_DIAG: pass = i00 <= i01; break;
case GGML_TRI_TYPE_LOWER: pass = i00 < i01; break;
}
if (pass) {
const float val = float(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val);
} else {
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0);
}
}
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// Elementwise truncation: drop the fractional part (round toward zero).
void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.KX) {
        return;
    }
    data_d[idx] = D_TYPE(trunc(float(data_a[idx])));
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment