OpenDAS / ollama

Commit a83eaa7a, authored Jul 19, 2023 by Michael Yang

    update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc

Parent: 5156e48c

Showing 12 changed files with 1708 additions and 650 deletions (+1708 -650)
llama/ggml-cuda.cu      +563  -62
llama/ggml-cuda.h         +1   -1
llama/ggml-metal.h        +1   -1
llama/ggml-metal.m       +45  -35
llama/ggml-metal.metal  +390 -329
llama/ggml.c            +493 -159
llama/ggml.h             +49   -2
llama/k_quants.c          +1   -1
llama/k_quants.h          +9   -1
llama/llama-util.h        +4   -4
llama/llama.cpp         +120  -53
llama/llama.h            +32   -2
llama/ggml-cuda.cu

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -39,6 +39,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
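Background note: __dp4a is the CUDA intrinsic (compute capability 6.1, sm_61, and newer) that computes a dot product of four packed 8-bit values and adds it to a 32-bit accumulator; the new MIN_CC_DP4A constant encodes that requirement so the quantized dot-product kernels further down can guard on it. Below is a minimal, self-contained sketch of that guard pattern; the helper name dot4_example and its fallback loop are illustrative only and not part of this commit.

#include <cuda_runtime.h>
#include <stdint.h>

#define MIN_CC_DP4A 610 // same threshold as in the diff: __dp4a needs sm_61 or newer

// Illustrative helper (not from the commit): signed byte-wise dot product of two
// packed 32-bit values, accumulated into acc, with a portable fallback.
static __device__ __forceinline__ int dot4_example(int a, int b, int acc) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a, b, acc); // hardware: acc + sum of the 4 signed byte products
#else
    // Older architectures: unpack each byte and accumulate manually.
    int sum = acc;
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)(a >> (8*i));
        const int8_t bi = (int8_t)(b >> (8*i));
        sum += (int)ai * (int)bi;
    }
    return sum;
#endif
}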
@@ -100,7 +102,7 @@ typedef void (*ggml_cuda_op_t)(
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -109,7 +111,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d;  // delta
     half m;  // min
@@ -119,7 +121,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d;         // delta
     uint8_t qh[4];  // 5-th bit of quants
@@ -129,7 +131,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d;  // delta
     half m;  // min
@@ -140,7 +142,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half   d;          // delta
     int8_t qs[QK8_0];  // quants
@@ -149,7 +151,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d;  // delta
     half s;  // unquantized sum
@@ -169,6 +171,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
@@ -177,6 +181,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -189,6 +195,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2];  // super-block scales/mins
@@ -206,6 +214,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d;  // super-block scale
@@ -225,6 +235,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2];  // quants, lower 4 bits
     uint8_t qh[QK_K/4];  // quants, upper 2 bits
@@ -238,6 +250,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,13 +278,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -292,6 +305,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
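For reference, the added gelu_f32 kernel evaluates the standard tanh approximation of GELU, which the two constants encode: GELU(x) ≈ 0.5·x·(1 + tanh(sqrt(2/π)·(x + 0.044715·x³))), with sqrt(2/π) ≈ 0.7978845608. The kernel factors the inner term as sqrt(2/π)·x·(1 + 0.044715·x²), which is algebraically the same expression.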
@@ -301,16 +327,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6;
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -322,10 +378,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     const float mean = tmp / ncols;
-    const float scale = 1.0f / sqrtf(mean + eps);
+    const float scale = rsqrtf(mean + eps);
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -1254,8 +1309,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }
 
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1276,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1301,11 +1358,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1336,11 +1394,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1370,11 +1429,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1389,7 +1449,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1412,7 +1685,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
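Arithmetic behind the iby change, using constants defined earlier in this file: each q8_1 block covers QK8_1 = 32 values, while a K-quant x block covers qk = QK_K values (256 in the usual build), so one x block lines up with qk/QK8_1 = 256/32 = 8 consecutive y blocks and the y index has to be scaled by that factor. For the non-K quant types qk is 32, the factor is 1, and the old behaviour is preserved.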
@@ -1650,6 +1923,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1715,9 +2022,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1730,11 +2037,22 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
@@ -1900,7 +2218,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1909,7 +2227,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1918,7 +2236,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1927,7 +2245,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1936,7 +2254,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1944,6 +2262,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2036,6 +2399,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2263,16 +2634,19 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
-    const int64_t ne0 = src0->ne[0];
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2291,27 +2665,41 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
-    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01  = dst_ddf_i  + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
@@ -2336,6 +2724,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }
 
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2376,13 +2786,22 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+    bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
 
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
@@ -2408,6 +2827,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;
@@ -2542,13 +2976,21 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
-    GGML_ASSERT(mode == 0);
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
     const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
 
+    bool is_glm = mode & 4;
+
     // compute
-    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2951,11 +3393,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }
 
+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3111,6 +3563,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3186,7 +3643,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         }
 
-        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
@@ -3220,6 +3677,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
@@ -3228,7 +3701,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3237,8 +3710,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -3253,10 +3725,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
...
@@ -3269,6 +3743,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
...
@@ -3269,6 +3743,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
CUDA_CHECK
(
cudaMalloc
(
&
data
,
g_scratch_size
));
CUDA_CHECK
(
cudaMalloc
(
&
data
,
g_scratch_size
));
g_scratch_buffer
=
data
;
g_scratch_buffer
=
data
;
}
}
extra
=
ggml_cuda_alloc_temp_tensor_extra
();
extra
->
data_device
[
g_main_device
]
=
data
+
g_scratch_offset
;
extra
->
data_device
[
g_main_device
]
=
data
+
g_scratch_offset
;
g_scratch_offset
+=
size
;
g_scratch_offset
+=
size
;
...
@@ -3278,6 +3753,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
...
@@ -3278,6 +3753,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
void
*
data
;
void
*
data
;
CUDA_CHECK
(
cudaMalloc
(
&
data
,
size
));
CUDA_CHECK
(
cudaMalloc
(
&
data
,
size
));
CUDA_CHECK
(
cudaMemset
(
data
,
0
,
size
));
CUDA_CHECK
(
cudaMemset
(
data
,
0
,
size
));
extra
=
new
ggml_tensor_extra_gpu
;
memset
(
extra
,
0
,
sizeof
(
*
extra
));
extra
->
data_device
[
g_main_device
]
=
data
;
extra
->
data_device
[
g_main_device
]
=
data
;
}
}
...
@@ -3330,6 +3807,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
...
@@ -3330,6 +3807,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
||
(
tensor
->
src
[
1
]
!=
nullptr
&&
tensor
->
src
[
1
]
->
backend
==
GGML_BACKEND_GPU
);
||
(
tensor
->
src
[
1
]
!=
nullptr
&&
tensor
->
src
[
1
]
->
backend
==
GGML_BACKEND_GPU
);
switch
(
tensor
->
op
)
{
switch
(
tensor
->
op
)
{
case
GGML_OP_DUP
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_dup
;
break
;
case
GGML_OP_ADD
:
case
GGML_OP_ADD
:
if
(
!
any_on_device
)
{
if
(
!
any_on_device
)
{
return
false
;
return
false
;
...
@@ -3342,12 +3825,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
...
@@ -3342,12 +3825,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
}
}
func
=
ggml_cuda_mul
;
func
=
ggml_cuda_mul
;
break
;
break
;
case
GGML_OP_GELU
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_gelu
;
break
;
case
GGML_OP_SILU
:
case
GGML_OP_SILU
:
if
(
!
any_on_device
)
{
if
(
!
any_on_device
)
{
return
false
;
return
false
;
}
}
func
=
ggml_cuda_silu
;
func
=
ggml_cuda_silu
;
break
;
break
;
case
GGML_OP_NORM
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_norm
;
break
;
case
GGML_OP_RMS_NORM
:
case
GGML_OP_RMS_NORM
:
if
(
!
any_on_device
)
{
if
(
!
any_on_device
)
{
return
false
;
return
false
;
...
@@ -3372,6 +3867,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
...
@@ -3372,6 +3867,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
}
}
func
=
ggml_cuda_cpy
;
func
=
ggml_cuda_cpy
;
break
;
break
;
case
GGML_OP_CONT
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_dup
;
break
;
case
GGML_OP_RESHAPE
:
case
GGML_OP_RESHAPE
:
case
GGML_OP_VIEW
:
case
GGML_OP_VIEW
:
case
GGML_OP_PERMUTE
:
case
GGML_OP_PERMUTE
:
...
...
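The view, copy and scratch branches above now take their ggml_tensor_extra_gpu from a fixed pool that is recycled round-robin instead of calling new for every node on every graph evaluation. A minimal standalone C sketch of that pattern, with simplified stand-in names and types (not the real CUDA structures):

#include <string.h>

#define POOL_SIZE 4096                      /* stand-in for GGML_MAX_NODES */

typedef struct { void * data_device; } extra_t;   /* simplified stand-in for ggml_tensor_extra_gpu */

static extra_t pool[POOL_SIZE];
static size_t  pool_index = 0;

/* hand out a zeroed slot; slots are reused after POOL_SIZE allocations, which is
   safe here because a temporary extra only has to live for one graph evaluation */
static extra_t * alloc_temp_extra(void) {
    extra_t * e = &pool[pool_index];
    pool_index = (pool_index + 1) % POOL_SIZE;
    memset(e, 0, sizeof(*e));
    return e;
}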
llama/ggml-cuda.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/ggml-metal.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/ggml-metal.m

 // +build darwin

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                     } break;
                 case GGML_TYPE_Q5_K:
@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                     } break;
                 case GGML_TYPE_Q6_K:
@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                     } break;
                 default:
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];

-                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                    src0t == GGML_TYPE_Q4_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q5_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q6_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1) / 2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
                 else if (src0t == GGML_TYPE_Q2_K ||
-                         src0t == GGML_TYPE_Q3_K ||
-                         src0t == GGML_TYPE_Q4_K ||
-                         src0t == GGML_TYPE_Q5_K ||
-                         src0t == GGML_TYPE_Q6_K) {
+                         src0t == GGML_TYPE_Q3_K) {
                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 } else {
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(
                 const float eps = 1e-6f;

-                const int nth = 256;
+                const int nth = 512;

                 [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                 [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

                 const int64_t nrows = ggml_nrows(src0);
@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(
                 const int n_past = ((int32_t *)(src1->data))[0];

+                float freq_base;
+                float freq_scale;
+                memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                 [encoder setComputePipelineState:ctx->pipeline_rope];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                 [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                 [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                 [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                 [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                 [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                 [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
                 [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
                 [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
                 [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
                 [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
                 [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
                 [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                 [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                 [encoder setBytes:&n_past length:sizeof(   int) atIndex:18];
                 [encoder setBytes:&n_dims length:sizeof(   int) atIndex:19];
                 [encoder setBytes:&mode   length:sizeof(   int) atIndex:20];
+                [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
...
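Two of the numbers above are easy to sanity-check by hand. This small standalone C sketch (not part of the commit; the row count is illustrative) reproduces the arithmetic: the rms_norm encoder now only needs one float of threadgroup memory per 32-wide SIMD group, and the Q4_K path launches one threadgroup per 8 output rows.

#include <stdio.h>

int main(void) {
    const int nth        = 512;   // threads per threadgroup for rms_norm
    const int simd_width = 32;    // assumed SIMD group width
    printf("rms_norm threadgroup floats: %d (was %d)\n", nth / simd_width, nth);

    const int ne01 = 4096;        // example row count of src0
    printf("Q4_K threadgroups along rows: %d\n", (ne01 + 7) / 8);
    return 0;
}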
llama/ggml-metal.metal

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -357,26 +357,33 @@ kernel void kernel_rms_norm(
         threadgroup float  * sum [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+    float4 sumf=0;
+    float all_sum=0;

     // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00] * x[i00];
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        sumf += x[i00] * x[i00];
     }
+    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
+    all_sum = simd_sum(all_sum);
+    if (tiisg == 0) {
+        sum[sgitg] = all_sum;
+    }

-    // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    // broadcast
+    // broadcast, simd group number is ntg / 32
+    for (int i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+    }
     if (tpitg == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
         sum[0] /= ne00;
     }
@@ -385,147 +392,127 @@ kernel void kernel_rms_norm(
     const float mean  = sum[0];
     const float scale = 1.0f/sqrt(mean + eps);

-    device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    device float4 * y = (device float4 *) (dst + tgpig*ne00);
+    device float * y_scalar = (device float *) y;
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
+    if (tpitg == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+    }
 }

-kernel void kernel_mul_mat_q4_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne10,
-        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_0;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
-    device const float * y = (device const float *) src1 + r1*ne10;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float * yl = y + i * QK4_0 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
-            acc[1] += yl[j] + yl[j+16];
-        }
-
-        sumf += d * (acc[0] - 8.f*acc[1]);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-}
+// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
+float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
+    float d = qb_curr->d;
+    float4 acc = 0.f;
+    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
+    for (int i = 0; i < 16; i+=2) {
+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+        acc[2] += yl[i + 1]  * (qs[i / 2] & 0x0F00);
+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    }
+    return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
+}
+
+// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
+float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+    float4 acc = 0.f;
+    device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
+    for (int i = 0; i < 16; i+=2) {
+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+        acc[2] += yl[i + 1]  * (qs[i / 2] & 0x0F00);
+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    }
+    return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
+}
+
+// putting them in the kernel cause a significant performance penalty
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+template<typename block_q_type>
+void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
+                     int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
+                     uint2 tgpig, uint tiisg, uint sgitg) {
+    const int nb = ne00/QK4_0;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+    device const float * y = (device const float *) src1 + r1*ne10;
+    float4 y_curr[8];       // src1 vector cache
+    float sumf[N_DST]={0.f}, all_sum;
+    thread float * yl=(thread float *)y_curr;
+
+    // each thread in a SIMD group deals with 1 block.
+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
+        }
+    }
+
+    // from now loads two rows every time and 16 blocks per row
+    int ir = tiisg / (N_SIMDWIDTH / 2);
+    int ib = tiisg % (N_SIMDWIDTH / 2);
+    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
+        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row+=2) {
+            if (nb_start + ib < nb) {
+                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
+            }
+        }
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+        }
+    }
+}
+
+kernel void kernel_mul_mat_q4_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+}

-kernel void kernel_mul_mat_q4_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne10,
-        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_1;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
-    device const float * y = (device const float *) src1 + r1*ne10;
-
-    const uint nth = tptg.x*tptg.y;
-    const uint ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-        const float m = (float)x[i].m;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float * yl = y + i * QK4_1 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-            acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
-            acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
-        }
-
-        sumf += acc[0] + acc[1];
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-}
+kernel void kernel_mul_mat_q4_1_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+}

 kernel void kernel_mul_mat_f16_f32(
@@ -641,17 +628,19 @@ kernel void kernel_rope(
         constant       int & n_past,
         constant       int & n_dims,
         constant       int & mode,
+        constant     float & freq_base,
+        constant     float & freq_scale,
         uint3 tpig[[thread_position_in_grid]]) {
     const int64_t i3 = tpig[2];
     const int64_t i2 = tpig[1];
     const int64_t i1 = tpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = (float)p;
+    float theta = freq_scale * (float)p;

     if (!is_neox) {
         for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -1489,6 +1478,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 }

+#if QK_K == 256
 kernel void kernel_mul_mat_q4_K_f32(
         device const  void * src0,
         device const float * src1,
@@ -1496,131 +1486,180 @@ kernel void kernel_mul_mat_q4_K_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
+        constant   int64_t & ne01[[buffer(4)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
-    device const float * yy = (device const float *) src1 + r1*ne10;
-
-    float sumf = 0;
-
-#if QK_K == 256
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;

-    const int tid = tpitg.y;   // 0...16
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uchar2 sc1, sc2, sc3, sc4;
-
-    for (int i = tpitg.x; i < nb; i += tptg.x) {
-
-        device const uint8_t * q1 = (x + i)->qs + q_offset;
-        device const uint8_t * q2 = q1 + 64;
-        device const float * y1 = yy + i*QK_K + y_offset;
-        device const float * y2 = y1 + 128;
-
-        const float dall = (float)((x + i)->d);
-        const float dmin = (float)((x + i)->dmin);
-
-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
-            s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
-        }
-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
-    }
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int im = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir;
+
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    for (int ib = ix; ib < nb; ib += 4) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im;
+        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & kmask1;
+            sc16[1] = sc[2] & kmask1;
+            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
+            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
+
+            device const uint16_t * q2 = q1 + 32;
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
+                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
+                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
+                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
+                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
+                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
+                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
+                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
+                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = all_sum;
+        }
+    }
+}
 #else
-    uint16_t aux16[2];
-    thread const uint8_t * scales = (thread const uint8_t *)aux16;
-
-    const int il = 4*tpitg.x;
-
-    for (int i = tpitg.y; i < nb; i += tptg.y) {
-
-        device const uint8_t * q = x[i].qs + il;
-        device const float * y = yy + i * QK_K + il;
-
-        const float d = (float)x[i].d[0];
-        const float m = (float)x[i].d[1];
-
-        device const uint16_t * a = (device const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        for (int l = 0; l < 4; ++l) {
-            sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
-                  + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
-        }
-    }
-#endif
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    // This version is slightly faster than the commented out one below,
-    // which I copy-pasted from ggerganov's q4_0 dot product for metal.
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-
-    //// accumulate the sum from all threads in the threadgroup
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = nth/2; i > 0; i /= 2) {
-    //    if (ith < i) {
-    //        sum[ith] += sum[ith + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-    //if (ith == 0) {
-    //    dst[r1*ne0 + r0] = sum[0];
-    //}
-}
+kernel void kernel_mul_mat_q4_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int ix = tiisg/4;  // 0...7
+    const int it = tiisg%4;  // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float yl[8];
+    float yh[8];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 8 * it;
+
+    uint16_t sc16[4];
+
+    for (int ib = ix; ib < nb; ib += 8) {
+
+        float2 sumy = {0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
+            yh[i] = y4[i+32]; sumy[1] += yh[i];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
+        device const half     * dh = x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & 0x000f;
+            sc16[1] = sc[0] & 0x0f00;
+            sc16[2] = sc[0] & 0x00f0;
+            sc16[3] = sc[0] & 0xf000;
+
+            float2 acc1 = {0.f, 0.f};
+            float2 acc2 = {0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
+                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
+                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
+                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
+
+            qs += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 8 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = all_sum;
+        }
+    }
+}
+#endif

 kernel void kernel_mul_mat_q5_K_f32(
         device const  void * src0,
@@ -1629,39 +1668,39 @@ kernel void kernel_mul_mat_q5_K_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

     const int nb = ne00/QK_K;

     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;

-    device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
     device const float * yy = (device const float *) src1 + r1*ne10;

-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    float sumf = 0;
+    float sumf[2]={0.f};
+
+    const int step = sizeof(block_q5_K) * nb;

 #if QK_K == 256
+#
+    float yl[16], yh[16];

     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;

-    const int tid = tpitg.y;   // 0...16
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int im  = tid/4;
+    const int ir  = tid%4;
+    const int n   = 8;
+
+    const int l0 = n*ir;
     const int q_offset = 32*im + l0;
     const int y_offset = 64*im + l0;
@@ -1670,78 +1709,114 @@ kernel void kernel_mul_mat_q5_K_f32(
     const uint8_t hm3 = hm1 << 4;
     const uint8_t hm4 = hm2 << 4;

-    uchar2 sc1, sc2, sc3, sc4;
-
-    for (int i = tpitg.x; i < nb; i += tptg.x) {
-
-        device const uint8_t * q1 = (x + i)->qs + q_offset;
-        device const uint8_t * q2 = q1 + 64;
-        device const uint8_t * qh = (x + i)->qh + l0;
-        device const float * y1 = yy + i*QK_K + y_offset;
-        device const float * y2 = y1 + 128;
-
-        const float dall = (float)((x + i)->d);
-        const float dmin = (float)((x + i)->dmin);
-
-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
-            s[1] += y1[l+32] * ((q1[l] >> 4) + (qh[l] & hm2 ? 16 : 0));
-            s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
-            s[3] += y2[l+32] * ((q2[l] >> 4) + (qh[l] & hm4 ? 16 : 0));
-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
-        }
-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
-    }
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    device const float * y1 = yy + ix*QK_K + y_offset;
+
+    for (int i = ix; i < nb; i += 4) {
+
+        device const uint8_t * q1 = x[i].qs + q_offset;
+        device const uint8_t * qh = x[i].qh + l0;
+        device const half * dh = &x[i].d;
+        device const uint16_t * a = (device const uint16_t *)x[i].scales + im;
+
+        device const float * y2 = y1 + 128;
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 8; ++l) {
+            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
+            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
+            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
+            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
+        }
+
+        for (int row = 0; row < 2; ++row) {
+
+            device const uint8_t * q2 = q1 + 64;
+
+            sc16[0] = a[0] & kmask1;
+            sc16[1] = a[2] & kmask1;
+            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
+            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
+
+            float4 acc = {0.f, 0.f, 0.f, 0.f};
+            for (int l = 0; l < n; ++l) {
+                uint8_t h = qh[l];
+                acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
+                acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
+                acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
+                acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
+            }
+            const float dall = dh[0];
+            const float dmin = dh[1];
+            sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            qh += step;
+            dh += step/2;
+            a  += step/2;
+        }
+
+        y1 += 4 * QK_K;
+    }
 #else
-    const int il = 4 * tpitg.x;  // 0, 4, 8, 12
-    const int im = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    for (int i = tpitg.y; i < nb; i += tptg.y) {
-
-        const float d = (float)x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-        device const float   * y = yy + i*QK_K + il;
-
-        for (int l = 0; l < 4; ++l) {
-            const uint8_t hl = h[l] >> im;
-            sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
-                  + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
-                  + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
-                  + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
-        }
-    }
+    float yl[8], yh[8];
+
+    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
+    const int ix = tiisg%8;
+    const int im = il/8;           // 0, 0, 1, 1
+    const int in = il%8;           // 0, 4, 0, 4
+
+    device const float * y = yy + ix*QK_K + il;
+
+    for (int i = ix; i < nb; i += 8) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 4; ++l) {
+            yl[l+0] = y[l+ 0];
+            yl[l+4] = y[l+16];
+            yh[l+0] = y[l+32];
+            yh[l+4] = y[l+48];
+        }
+
+        device const half * dh = &x[i].d;
+        device const uint8_t * q = x[i].qs + il;
+        device const uint8_t * h = x[i].qh + in;
+        device const int8_t  * s = x[i].scales;
+
+        for (int row = 0; row < 2; ++row) {
+
+            const float d = dh[0];
+
+            float2 acc = {0.f, 0.f};
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t hl = h[l] >> im;
+                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
+                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
+                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
+                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
+            }
+            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
+
+            q  += step;
+            h  += step;
+            s  += step;
+            dh += step/2;
+        }
+
+        y += 8 * QK_K;
+    }
 #endif
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
+
+    for (int row = 0; row < 2; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = tot;
+        }
+    }
 }
@@ -1753,10 +1828,9 @@ kernel void kernel_mul_mat_q6_K_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

     const uint8_t kmask1 = 0x03;
     const uint8_t kmask2 = 0x0C;
@@ -1768,19 +1842,18 @@ kernel void kernel_mul_mat_q6_K_f32(
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;

-    device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
-    device const float     * yy = (device const float      *) src1 + r1*ne10;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
+    const int row = 2 * r0 + sgitg;
+
+    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
+    device const float     * yy = (device const float      *) src1 + r1*ne10;

     float sumf = 0;

 #if QK_K == 256
-    // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
-    const int iqs  = 16 * tpitg.y;
-    const int ip   = iqs / 128;         // 0 or 1
-    const int il   = (iqs - 128*ip)/16; // 0...7
+    const int tid  = tiisg/2;
+    const int ix   = tiisg%2;
+    const int ip   = tid/8;         // 0 or 1
+    const int il   = tid%8;
     const int n    = 4;
     const int l0   = n*il;
     const int is   = 8*ip + l0/16;
@@ -1789,9 +1862,10 @@ kernel void kernel_mul_mat_q6_K_f32(
     const int q_offset_l = 64*ip + l0;
     const int q_offset_h = 32*ip + l0;

-    for (int i = tpitg.x; i < nb; i += tptg.x) {
+    for (int i = ix; i < nb; i += 2) {

-        device const uint8_t * ql = x[i].ql + q_offset_l;
+        device const uint8_t * q1 = x[i].ql + q_offset_l;
+        device const uint8_t * q2 = q1 + 32;
         device const uint8_t * qh = x[i].qh + q_offset_h;
         device const int8_t  * sc = x[i].scales + is;
@@ -1801,19 +1875,21 @@ kernel void kernel_mul_mat_q6_K_f32(
         float4 sums = {0.f, 0.f, 0.f, 0.f};
         for (int l = 0; l < n; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+64] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
-            sums[3] += y[l+96] * ((int8_t)((ql[l+32] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
+            sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
         }

         sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
     }
 #else
-    const int il = 4*tpitg.x;     // 0, 4, 8, 12
+    const int ix = tiisg/4;
+    const int il = 4*(tiisg%4);

-    for (int i = tpitg.y; i < nb; i += tptg.y) {
+    for (int i = ix; i < nb; i += 8) {
         device const float * y = yy + i * QK_K + il;
         device const uint8_t * ql = x[i].ql + il;
         device const uint8_t * qh = x[i].qh + il;
@@ -1833,23 +1909,8 @@ kernel void kernel_mul_mat_q6_K_f32(
 #endif

-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
+    const float tot = simd_sum(sumf);
+    if (tiisg == 0) {
+        dst[r1*ne0 + row] = tot;
+    }
 }
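The comment on block_q_n_dot_y above describes the inner product between one Q4_0 block and 32 floats, with the precomputed sum of those floats (sumy) used to pull the constant -8 offset out of the inner loop. A scalar C sketch of the same quantity, not taken from the commit, assuming the usual block_q4_0 layout (fp16 scale d plus 16 bytes of packed quants, low nibble = element i, high nibble = element i+16):

#include <stdint.h>

/* dot product of one q4_0 block (32 weights) with 32 floats; sumy = sum of y[0..31] */
static float q4_0_block_dot(float d, const uint8_t qs[16], const float y[32], float sumy) {
    float acc = 0.0f;
    for (int i = 0; i < 16; ++i) {
        acc += y[i]      * (float)(qs[i] & 0x0F);   /* low nibble  -> element i      */
        acc += y[i + 16] * (float)(qs[i] >> 4);     /* high nibble -> element i + 16 */
    }
    /* each stored nibble encodes (q - 8) * d, so the -8 bias is applied once via sumy */
    return d * (acc - 8.0f * sumy);
}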
llama/ggml.c

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -51,16 +51,23 @@
 #include <float.h>
 #include <limits.h>
 #include <stdarg.h>
+#include <signal.h>

 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif

+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif

 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
@@ -75,23 +82,23 @@
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;

-static void atomic_store(atomic_int* ptr, LONG val) {
+static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
-static LONG atomic_load(atomic_int* ptr) {
+static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
-static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
+static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }

 typedef HANDLE pthread_t;

 typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
     (void) unused;
     HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
     if (handle == NULL)
@@ -103,7 +110,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void
     return 0;
 }

-static int pthread_join(pthread_t thread, void* unused) {
+static int pthread_join(pthread_t thread, void * unused) {
     (void) unused;
     return (int) WaitForSingleObject(thread, INFINITE);
 }
@@ -116,7 +123,7 @@ static int sched_yield (void) {
 #include <pthread.h>
 #include <stdatomic.h>

-typedef void* thread_ret_t;
+typedef void * thread_ret_t;

 #include <sys/types.h>
 #include <sys/stat.h>
@@ -137,10 +144,6 @@ typedef void* thread_ret_t;
 #endif
 #endif

-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -3812,6 +3815,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "POOL_1D",
+    "POOL_2D",

     "FLASH_ATTN",
     "FLASH_FF",
@@ -3830,7 +3835,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3890,6 +3895,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "pool_1d(x)",
+    "pool_2d(x)",

     "flash_attn(x)",
     "flash_ff(x)",
@@ -3908,7 +3915,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+
+static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4187,10 +4196,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return
-        (t0->ne[0] == t1->ne[0]) &&
-        (t0->ne[2] == t1->ne[2]) &&
-        (t0->ne[3] == t1->ne[3]);
+    return (t0->ne[0]           == t1->ne[0])  &&
+           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
 }

 static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
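With the relaxed check, src0 can now be broadcast across the higher dimensions of src1 as long as they divide evenly. A hedged usage sketch against the public ggml API (tensor sizes are made up for illustration):

#include "ggml.h"

// one shared weight matrix multiplied against a batched input:
// a is [256, 64, 1, 1]; b is [256, 32, 8, 1]. Before this change a->ne[2] had to
// equal b->ne[2]; now it is enough that b->ne[2] is a multiple of a->ne[2].
static struct ggml_tensor * broadcast_mul_mat_example(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 64);    // weights
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 256, 32, 8); // batched activations
    return ggml_mul_mat(ctx, a, b);  // result is [64, 32, 8, 1], per the new ne[] above
}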
...
@@ -4430,8 +4438,8 @@ void ggml_free(struct ggml_context * ctx) {
...
@@ -4430,8 +4438,8 @@ void ggml_free(struct ggml_context * ctx) {
if (&g_state.contexts[i].context == ctx) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d
with %d objects
has been freed. memory used = %zu\n",
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i,
ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size
);
__func__, i,
ggml_used_mem(ctx)
);
if (ctx->mem_buffer_owned) {
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
GGML_ALIGNED_FREE(ctx->mem_buffer);
...
@@ -4749,7 +4757,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
...
@@ -4749,7 +4757,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
{
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
GGML_FP32_TO_FP16(
value)
)
;
}
}
} break;
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F32:
...
@@ -4801,7 +4809,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
...
@@ -4801,7 +4809,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
{
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
GGML_FP32_TO_FP16(
value)
)
;
}
}
} break;
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F32:
...
@@ -5061,11 +5069,15 @@ struct ggml_tensor * ggml_add_impl(
...
@@ -5061,11 +5069,15 @@ struct ggml_tensor * ggml_add_impl(
struct ggml_tensor * a,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * b,
bool inplace) {
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
bool is_node = false;
bool is_node = false;
if (a->grad || b->grad) {
if (!inplace && (a->grad || b->grad)) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true;
is_node = true;
}
}
...
@@ -6051,8 +6063,8 @@ struct ggml_tensor * ggml_mul_mat(
...
@@ -6051,8 +6063,8 @@ struct ggml_tensor * ggml_mul_mat(
is_node = true;
is_node = true;
}
}
const int64_t ne[4] = { a->ne[1], b->ne[1],
a
->ne[2], b->ne[3] };
const int64_t ne[4] = { a->ne[1], b->ne[1],
b
->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, M
IN
(a->n_dims, b->n_dims), ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, M
AX
(a->n_dims, b->n_dims), ne);
result->op = GGML_OP_MUL_MAT;
result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
...
@@ -6970,6 +6982,8 @@ struct ggml_tensor * ggml_rope_impl(
...
@@ -6970,6 +6982,8 @@ struct ggml_tensor * ggml_rope_impl(
int n_past,
int n_past,
int n_dims,
int n_dims,
int mode,
int mode,
float freq_base,
float freq_scale,
int n_ctx,
int n_ctx,
bool inplace) {
bool inplace) {
GGML_ASSERT(n_past >= 0);
GGML_ASSERT(n_past >= 0);
...
@@ -6983,12 +6997,14 @@ struct ggml_tensor * ggml_rope_impl(
...
@@ -6983,12 +6997,14 @@ struct ggml_tensor * ggml_rope_impl(
ggml_scratch_save(ctx);
ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
4
);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
6
);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;
((int32_t *) b->data)[3] = n_ctx;
memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
ggml_scratch_load(ctx);
ggml_scratch_load(ctx);
...
@@ -7007,7 +7023,7 @@ struct ggml_tensor * ggml_rope(
    int n_dims,
    int mode,
    int n_ctx) {
-   return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
}

struct ggml_tensor * ggml_rope_inplace(
...
@@ -7017,7 +7033,19 @@ struct ggml_tensor * ggml_rope_inplace(
    int n_dims,
    int mode,
    int n_ctx) {
-   return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
}

struct ggml_tensor * ggml_rope_custom_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        int                   n_dims,
        int                   mode,
        float                 freq_base,
        float                 freq_scale,
        int                   n_ctx) {
    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
}

// ggml_rope_back
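The new ggml_rope_custom_inplace entry point is what lets callers override the defaults (base 10000.0f, scale 1.0f) that ggml_rope and ggml_rope_inplace keep for compatibility. A usage sketch; the tensor `cur` and the parameter values are illustrative assumptions, not from this commit:

    // apply RoPE with a linear context-scaling factor of 0.5
    // `cur` is assumed to be an F32 tensor already in the graph
    struct ggml_tensor * rotated = ggml_rope_custom_inplace(
            ctx, cur,
            /*n_past    =*/ 0,
            /*n_dims    =*/ 128,
            /*mode      =*/ 0,
            /*freq_base =*/ 10000.0f,
            /*freq_scale=*/ 0.5f,
            /*n_ctx     =*/ 4096);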
...
@@ -7188,7 +7216,6 @@ struct ggml_tensor* ggml_conv_2d(
    int d0,
    int d1) {
-   GGML_ASSERT(b->ne[3] == 1);
    GGML_ASSERT(a->ne[2] == b->ne[2]);
    bool is_node = false;
...
@@ -7200,7 +7227,7 @@ struct ggml_tensor* ggml_conv_2d(
    const int64_t ne[4] = {
        ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
        ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
-       a->ne[3], 1,
        a->ne[3], b->ne[3],
    };
    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
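With b->ne[3] no longer forced to 1, ggml_conv_2d carries the batch dimension of the input through to the result, and the spatial extent per axis comes from ggml_calc_conv_output_size. A sketch under assumed shapes (the sizes and the expected output shape are illustrative, not asserted by the commit):

    // 3x3 kernel over a batched input, stride 1, padding 1, dilation 1
    // kernel: [kw, kh, C_in, C_out] in F16; input: [W, H, C_in, N] in F32, N > 1 now allowed
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, 64, 128);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 224, 224, 64, 8);
    struct ggml_tensor * conv   = ggml_conv_2d(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
    // with the usual convolution output arithmetic, conv->ne should come out as { 224, 224, 128, 8 }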
...
@@ -7235,6 +7262,98 @@ struct ggml_tensor* ggml_conv_1d_ph(
    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
}
// ggml_pool_*
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
return (ins + 2 * p - ks) / s + 1;
}
// ggml_pool_1d
struct ggml_tensor* ggml_pool_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int s0,
int p0) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
a->ne[1],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
ggml_scratch_save(ctx);
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
((int32_t*)c->data)[0] = op;
((int32_t*)c->data)[1] = k0;
((int32_t*)c->data)[2] = s0;
((int32_t*)c->data)[3] = p0;
ggml_scratch_load(ctx);
result->op = GGML_OP_POOL_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = c;
return result;
}
// ggml_pool_2d
struct ggml_tensor* ggml_pool_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
int p0,
int p1) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
a->ne[2],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
ggml_scratch_save(ctx);
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
((int32_t*)c->data)[0] = op;
((int32_t*)c->data)[1] = k0;
((int32_t*)c->data)[2] = k1;
((int32_t*)c->data)[3] = s0;
((int32_t*)c->data)[4] = s1;
((int32_t*)c->data)[5] = p0;
((int32_t*)c->data)[6] = p1;
ggml_scratch_load(ctx);
result->op = GGML_OP_POOL_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = c;
return result;
}
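The two constructors above only record the op and its integer parameters in a small I32 side tensor; the actual reduction happens in the forward pass further down, where stride must equal the kernel size and padding must be zero. A usage sketch with made-up sizes, following the (ins + 2*p - ks)/s + 1 output-size rule defined above:

    // 2-wide average pooling over a [W, C] F32 tensor, then 2x2 max pooling in 2-D
    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 16);
    struct ggml_tensor * p1 = ggml_pool_1d(ctx, x, GGML_OP_POOL_AVG, /*k0=*/2, /*s0=*/2, /*p0=*/0); // -> [64, 16]

    struct ggml_tensor * img = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 3);
    struct ggml_tensor * p2  = ggml_pool_2d(ctx, img, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);          // -> [32, 32, 3]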
// ggml_flash_attn

struct ggml_tensor * ggml_flash_attn(
...
@@ -8323,7 +8442,7 @@ static void ggml_compute_forward_add_f32(
    const struct ggml_tensor * src0,
    const struct ggml_tensor * src1,
    struct ggml_tensor * dst) {
-   GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
...
@@ -8348,23 +8467,23 @@ static void ggml_compute_forward_add_f32(
    if (nb10 == sizeof(float)) {
        for (int ir = ir0; ir < ir1; ++ir) {
-           // src0, src1 and dst are same shape => same indices
-           const int i3 = ir/(ne2*ne1);
-           const int i2 = (ir - i3*ne2*ne1)/ne1;
-           const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
            // src1 is broadcastable across src0 and dst in i1, i2, i3
            const int64_t i03 = ir/(ne02*ne01);
            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

            const int64_t i13 = i03 % ne13;
            const int64_t i12 = i02 % ne12;
            const int64_t i11 = i01 % ne11;

            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

#ifdef GGML_USE_ACCELERATE
-           vDSP_vadd(
-                   (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                   (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                   (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                   ne0);
            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
#else
-           ggml_vec_add_f32(ne0,
-                   (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                   (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                   (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
            ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
#endif
            // }
            // }
...
@@ -8372,15 +8491,20 @@ static void ggml_compute_forward_add_f32(
    } else {
        // src1 is not contiguous
        for (int ir = ir0; ir < ir1; ++ir) {
-           // src0, src1 and dst are same shape => same indices
-           const int i3 = ir/(ne2*ne1);
-           const int i2 = (ir - i3*ne2*ne1)/ne1;
-           const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
            // src1 is broadcastable across src0 and dst in i1, i2, i3
            const int64_t i03 = ir/(ne02*ne01);
            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

            const int64_t i13 = i03 % ne13;
            const int64_t i12 = i02 % ne12;
            const int64_t i11 = i01 % ne11;

-           float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-           float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);

            for (int i0 = 0; i0 < ne0; i0++) {
-               float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);

                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
            }
...
@@ -10559,7 +10683,6 @@ static void ggml_compute_forward_rms_norm_back(
        }
    }
}

// ggml_compute_forward_mul_mat

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
...
@@ -10603,17 +10726,19 @@ static void ggml_compute_forward_mul_mat(
    const int ith = params->ith;
    const int nth = params->nth;

-   GGML_ASSERT(ne02 == ne12);
-   GGML_ASSERT(ne03 == ne13);
-   GGML_ASSERT(ne2  == ne12);
-   GGML_ASSERT(ne3  == ne13);

    const enum ggml_type type = src0->type;

    const bool src1_cont = ggml_is_contiguous(src1);

    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
    GGML_ASSERT(ne2 == ne12);
    GGML_ASSERT(ne3 == ne13);

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
    GGML_ASSERT(nb10 == sizeof(float));
...
@@ -10624,16 +10749,16 @@ static void ggml_compute_forward_mul_mat(
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

-   GGML_ASSERT(ne0 == ne01);
-   GGML_ASSERT(ne1 == ne11);
-   GGML_ASSERT(ne2 == ne02);
-   GGML_ASSERT(ne3 == ne03);

    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows

#if defined(GGML_USE_CLBLAST)
    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
        //       ref: https://github.com/ggerganov/ggml/pull/224
        GGML_ASSERT(ne02 == ne12);
        GGML_ASSERT(ne03 == ne13);

        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
        }
...
@@ -10643,6 +10768,11 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
        //       ref: https://github.com/ggerganov/ggml/pull/224
        GGML_ASSERT(ne02 == ne12);
        GGML_ASSERT(ne03 == ne13);

        if (params->ith != 0) {
            return;
        }
...
@@ -10663,7 +10793,7 @@ static void ggml_compute_forward_mul_mat(
                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

                if (type != GGML_TYPE_F32) {
-                   float * const wdata = params->wdata;
                    float * const wdata    = params->wdata;
                    ggml_to_float_t const to_float = type_traits[type].to_float;

                    size_t id = 0;
...
@@ -10712,41 +10842,52 @@ static void ggml_compute_forward_mul_mat(
        return;
    }

-   // parallelize by src0 rows using ggml_vec_dot_q
-
-   // total rows in src0
-   const int nr = ne01*ne02*ne03;
-
-   // rows per thread
-   const int dr = (nr + nth - 1)/nth;
-
-   // row range for this thread
-   const int ir0 = dr*ith;
-   const int ir1 = MIN(ir0 + dr, nr);
-
-   void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-   const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
-   for (int ir = ir0; ir < ir1; ++ir) {
-       // src0 indices
-       const int i03 = ir/(ne02*ne01);
-       const int i02 = (ir - i03*ne02*ne01)/ne01;
-       const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-       const int i13 = i03;
-       const int i12 = i02;
-
-       const int i0 = i01;
-       const int i2 = i02;
-       const int i3 = i03;
-
-       void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-       char * src1_col =          ((char *)      wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size));
-
-       float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-
-       for (int64_t ic = 0; ic < ne11; ++ic) {
-           vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
-       }
-   }
    // parallelize by src0 rows
    const int64_t dr = (ne01 + nth - 1)/nth;

    const int64_t ir10 = dr*ith;
    const int64_t ir11 = MIN(ir10 + dr, ne01);

    // src1 rows
    const int64_t nr1 = ne11*ne12*ne13;

    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
        const int64_t i13 = (ir1/(ne12*ne11));
        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);

        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
        const int64_t i03 = (ir0/(ne02));
        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
        // GG: this is likely the correct way to broadcast, though need some more thought
        //     therefore leaving the comments to remind us for now
        const int64_t i02 = (i12 / (ne12 / ne02));
        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
        // const int64_t i02 = (ir0 - i03*ne02);

        const int64_t i1 = i11;
        const int64_t i2 = i12;
        const int64_t i3 = i13;

        const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);

        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
        //       the original src1 data pointer, so we should index using the indices directly
        // TODO: this is a bit of a hack, we should probably have a better way to handle this
        const char * src1_col = (const char *) wdata +
            (src1_cont || src1->type != vec_dot_type
             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
             : (i11*nb11 + i12*nb12 + i13*nb13));

        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));

        for (int64_t ir = ir10; ir < ir11; ++ir) {
            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
        }
    }
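The rewritten loop iterates over src1 rows and maps src0 indices with i02 = i12 / (ne12 / ne02), which is what lets a single 2-D weight (or a smaller set of KV heads, per the Falcon note above) be reused across a larger batch dimension of src1. A rough usage sketch, not taken from the commit; whether a given broadcast pattern is accepted also depends on the backend in use:

    // one [K, M] weight matrix multiplied against a [K, N, batch] activation
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);  // ne = {K, M}
    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4096, 32, 8); // ne = {K, N, batch}
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);                             // ne = {M, N, batch, 1} with the new shape rule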
...
@@ -11743,7 +11884,7 @@ static void ggml_compute_forward_alibi_f32(
    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
    const int ne1 = src0->ne[1]; // seq_len_without_past
-   //const int ne2 = src0->ne[2]; // n_head -> this is k
    const int ne2 = src0->ne[2]; // n_head -> this is k
    //const int ne3 = src0->ne[3]; // 1 -> bsz

    const int n = ggml_nrows(src0);
...
@@ -11754,8 +11895,9 @@ static void ggml_compute_forward_alibi_f32(
    const int nb2 = src0->nb[2];
    //const int nb3 = src0->nb[3];

-   assert(nb0 == sizeof(float));
-   assert(ne1 + n_past == ne0); (void) n_past;
    GGML_ASSERT(nb0 == sizeof(float));
    GGML_ASSERT(ne1 + n_past == ne0);
    GGML_ASSERT(n_head == ne2);

    // add alibi to src0 (KQ_scaled)
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
...
@@ -11779,7 +11921,7 @@ static void ggml_compute_forward_alibi_f32(
                m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
            }

-           pdst[0] = (i-ne0+1) * m_k + src[0];
            pdst[0] = i * m_k + src[0];
        }
    }
...
@@ -11808,7 +11950,7 @@ static void ggml_compute_forward_alibi_f16(
    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
    const int ne1 = src0->ne[1]; // seq_len_without_past
-   //const int ne2 = src0->ne[2]; // n_head -> this is k
    const int ne2 = src0->ne[2]; // n_head -> this is k
    //const int ne3 = src0->ne[3]; // 1 -> bsz

    const int n = ggml_nrows(src0);
...
@@ -11819,8 +11961,9 @@ static void ggml_compute_forward_alibi_f16(
    const int nb2 = src0->nb[2];
    //const int nb3 = src0->nb[3];

-   assert(nb0 == sizeof(ggml_fp16_t));
-   assert(ne1 + n_past == ne0); (void) n_past;
    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
    GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
    GGML_ASSERT(n_head == ne2);

    // add alibi to src0 (KQ_scaled)
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
...
@@ -11845,7 +11988,7 @@ static void ggml_compute_forward_alibi_f16(
            }

            // we return F32
-           pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
            pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
        }
    }
}
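The bias added to each KQ entry is now position * slope rather than being shifted to end at zero; the per-head slopes themselves are unchanged. As a standalone sketch of that slope computation, mirroring the loop above but with m0/m1 written out under the conventional max_bias parameter (the exact constants are an assumption, they are not shown in this hunk):

    #include <math.h>
    // ALiBi slope for head k of n_head
    static float alibi_slope(int k, int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
        const float m0 = powf(2.0f, -max_bias / n_heads_log2_floor);
        const float m1 = powf(2.0f, -max_bias / 2.0f / n_heads_log2_floor);
        return (k < n_heads_log2_floor)
            ? powf(m0, k + 1)                               // power-of-two head counts
            : powf(m1, 2 * (k - n_heads_log2_floor) + 1);   // odd-power fallback, as in the code above
    }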
...
@@ -11973,16 +12116,21 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-   GGML_ASSERT(ggml_nelements(src1) == 4);
    GGML_ASSERT(ggml_nelements(src1) == 6);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    float freq_base;
    float freq_scale;

    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

    assert(n_past >= 0);
...
@@ -12011,7 +12159,7 @@ static void ggml_compute_forward_rope_f32(
    // row index used to determine which thread to use
    int ir = 0;

-   const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
...
@@ -12023,7 +12171,7 @@ static void ggml_compute_forward_rope_f32(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;

-           float theta = (float)p;
            float theta = freq_scale * (float)p;

            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
...
@@ -12100,16 +12248,21 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-   GGML_ASSERT(ggml_nelements(src1) == 4);
    GGML_ASSERT(ggml_nelements(src1) == 6);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    float freq_base;
    float freq_scale;

    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

    assert(n_past >= 0);
...
@@ -12138,7 +12291,7 @@ static void ggml_compute_forward_rope_f16(
    // row index used to determine which thread to use
    int ir = 0;

-   const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
...
@@ -12150,7 +12303,7 @@ static void ggml_compute_forward_rope_f16(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;

-           float theta = (float)p;
            float theta = freq_scale * (float)p;

            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
...
@@ -12211,7 +12364,7 @@ static void ggml_compute_forward_rope_f16(
                const float x0 = GGML_FP16_TO_FP32(src[0]);
                const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

-               dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
            }
        }
...
@@ -12893,12 +13046,13 @@ static void ggml_compute_forward_conv_1d(
        };
    }
}

-// ggml_compute_forward_conv_2d_sk_p0
// ggml_compute_forward_conv_2d

-static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
static void ggml_compute_forward_conv_2d_f16_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
...
@@ -12918,11 +13072,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
    // size of the convolution row - the kernel size unrolled across all channels
    const int ew0 = nk0*nk1*ne02;

    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
    const int32_t d1 = ((const int32_t*)(opt0->data))[5];

    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
        // TODO: fix this memset (wsize is overestimated)
        memset(params->wdata, 0, params->wsize);

        // prepare source data (src1)
...
@@ -12937,8 +13097,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
                for (int i0 = 0; i0 < ne0; i0++) {
                    for (int ik1 = 0; ik1 < nk1; ik1++) {
                        for (int ik0 = 0; ik0 < nk0; ik0++) {
-                           dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                               GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
                            const int idx0 = i0*s0 + ik0*d0 - p0;
                            const int idx1 = i1*s1 + ik1*d1 - p1;

                            if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
                                dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
                                    GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                            }
                        }
                    }
                }
...
@@ -12965,32 +13130,36 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
        ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

-       for (int i2 = ip0; i2 < ip1; i2++) {
-           float * dst_data = (float *)((char *) dst->data + i2*nb2);
-
-           for (int i1 = 0; i1 < ne1; ++i1) {
-               for (int i0 = 0; i0 < ne0; ++i0) {
-                   ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
-                           (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
-                           (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
-               }
-           }
-       }
        for (int i3 = 0; i3 < ne3; i3++) {
            for (int i2 = ip0; i2 < ip1; i2++) {
                float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);

                for (int i1 = 0; i1 < ne1; ++i1) {
                    for (int i0 = 0; i0 < ne0; ++i0) {
                        ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
                                (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
                                (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
                    }
                }
            }
        }
    }
}

-static void ggml_compute_forward_conv_2d_sk_p0(
static void ggml_compute_forward_conv_2d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-       struct ggml_tensor * dst) {
        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-               ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
            } break;
        case GGML_TYPE_F32:
            {
-               //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
                GGML_ASSERT(false);
            } break;
        default:
...
@@ -13000,31 +13169,164 @@ static void ggml_compute_forward_conv_2d_sk_p0(
    }
}

-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
-       const struct ggml_compute_params* params,
-       const struct ggml_tensor* src0,
-       const struct ggml_tensor* src1,
-       const struct ggml_tensor* opt0,
-       struct ggml_tensor* dst) {
-   const int32_t s0 = ((const int32_t*)(opt0->data))[0];
-   const int32_t s1 = ((const int32_t*)(opt0->data))[1];
-   const int32_t p0 = ((const int32_t*)(opt0->data))[2];
-   const int32_t p1 = ((const int32_t*)(opt0->data))[3];
-   const int32_t d0 = ((const int32_t*)(opt0->data))[4];
-   const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-   GGML_ASSERT(d0 == 1); // dilation not supported
-   GGML_ASSERT(d1 == 1);
-   GGML_ASSERT(p0 == 0); // padding not supported
-   GGML_ASSERT(p1 == 0);
-
-   // only s = k supported
-   if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
-       ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-   }
-   else {
-       GGML_ASSERT(false); // only stride equal to kernel size is supported
-   };
-}

// ggml_compute_forward_pool_1d_sk_p0

static void ggml_compute_forward_pool_1d_sk_p0(
const struct ggml_compute_params * params,
const enum ggml_op_pool op,
const struct ggml_tensor * src,
const int k,
struct ggml_tensor * dst) {
assert(src->type == GGML_TYPE_F32);
assert(params->ith == 0);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const char * cdata = (const char *)src->data;
const char * const data_end = cdata + ggml_nbytes(src);
float * drow = (float *)dst->data;
const int64_t rs = dst->ne[0];
while (cdata < data_end) {
const float * const srow = (const float *)cdata;
int j = 0;
for (int64_t i = 0; i < rs; ++i) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] = 0; break;
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
for (int ki = 0; ki < k; ++ki) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
++j;
}
switch (op) {
case GGML_OP_POOL_AVG: drow[i] /= k; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
cdata += src->nb[1];
drow += rs;
}
}
// ggml_compute_forward_pool_1d
static void ggml_compute_forward_pool_1d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    GGML_ASSERT(opt0->ne[0] == 4);
    const int* opts = (const int*)opt0->data;
    enum ggml_op_pool op = opts[0];
    const int k0 = opts[1];
    const int s0 = opts[2];
    const int p0 = opts[3];
    GGML_ASSERT(p0 == 0); // padding not supported
    GGML_ASSERT(k0 == s0); // only s = k supported

    ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
}

// ggml_compute_forward_pool_2d_sk_p0

static void ggml_compute_forward_pool_2d_sk_p0(
        const struct ggml_compute_params * params,
        const enum ggml_op_pool op,
        const struct ggml_tensor * src,
        const int k0,
        const int k1,
        struct ggml_tensor * dst) {
    assert(src->type == GGML_TYPE_F32);
    assert(params->ith == 0);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    const char * cdata = (const char*)src->data;
    const char * const data_end = cdata + ggml_nbytes(src);
const int64_t px = dst->ne[0];
const int64_t py = dst->ne[1];
const int64_t pa = px * py;
float * dplane = (float *)dst->data;
const int ka = k0 * k1;
while (cdata < data_end) {
for (int oy = 0; oy < py; ++oy) {
float * const drow = dplane + oy * px;
for (int ox = 0; ox < px; ++ox) {
float * const out = drow + ox;
switch (op) {
case GGML_OP_POOL_AVG: *out = 0; break;
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
const int ix = ox * k0;
const int iy = oy * k1;
for (int ky = 0; ky < k1; ++ky) {
const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
for (int kx = 0; kx < k0; ++kx) {
int j = ix + kx;
switch (op) {
case GGML_OP_POOL_AVG: *out += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
switch (op) {
case GGML_OP_POOL_AVG: *out /= ka; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
cdata += src->nb[2];
dplane += pa;
}
}
// ggml_compute_forward_pool_2d
static void ggml_compute_forward_pool_2d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
GGML_ASSERT(opt0->ne[0] == 7);
const int* opts = (const int*)opt0->data;
enum ggml_op_pool op = opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
const int s1 = opts[4];
const int p0 = opts[5];
const int p1 = opts[6];
GGML_ASSERT(p0 == 0);
GGML_ASSERT(p1 == 0); // padding not supported
GGML_ASSERT(k0 == s0);
GGML_ASSERT(k1 == s1); // only s = k supported
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
}
}
...
@@ -14808,6 +15110,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            } break;
        case GGML_OP_POOL_1D:
            {
                ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_POOL_2D:
            {
                ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_FLASH_ATTN:
            {
                const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
...
@@ -15452,7 +15762,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                // necessary for llama
                if (src0->grad) {
                    assert(src1->type == GGML_TYPE_I32);
-                   assert(ggml_nelements(src1) == 4);
                    assert(ggml_nelements(src1) == 6);
                    const int n_past = ((int32_t *) src1->data)[0];
                    const int n_dims = ((int32_t *) src1->data)[1];
                    const int mode   = ((int32_t *) src1->data)[2];
...
@@ -15473,7 +15783,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                if (src0->grad) {
                    assert(src1->type == GGML_TYPE_I32);
-                   assert(ggml_nelements(src1) == 4);
                    assert(ggml_nelements(src1) == 3);
                    const int n_past = ((int32_t *) src1->data)[0];
                    const int n_dims = ((int32_t *) src1->data)[1];
                    const int mode   = ((int32_t *) src1->data)[2];
...
@@ -15508,6 +15818,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_POOL_1D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_POOL_2D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_FLASH_ATTN:
            {
                struct ggml_tensor * flash_grad = NULL;
...
@@ -15970,6 +16288,9 @@ struct ggml_compute_state_shared {
    // synchronization primitives
    atomic_int n_active; // num active threads
    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
};

struct ggml_compute_state {
...
@@ -16001,6 +16322,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    int node_n = -1;

    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
            state->shared->node_n += 1;
            return (thread_ret_t) GGML_EXIT_ABORTED;
        }
        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            // all other threads are finished and spinning
            // do finalize and init here so we don't have synchronize again
...
@@ -16018,8 +16343,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.nth = n_tasks_arr[node_n];
                    ggml_compute_forward(&params, node);
                    ggml_graph_compute_perf_stats_node(node, state->shared);
                }
-               ggml_graph_compute_perf_stats_node(node, state->shared);
            }

            // distribute new work or execute it direct if 1T
...
@@ -16049,11 +16374,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.type = GGML_TASK_FINALIZE;
                    ggml_compute_forward(&params, node);
                    ggml_graph_compute_perf_stats_node(node, state->shared);
                }
-               ggml_graph_compute_perf_stats_node(node, state->shared);
            } else {
                break;
            }

            if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
                break;
            }
        }

        atomic_store(&state->shared->n_active, n_threads);
...
@@ -16087,7 +16417,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        }
    }

-   return 0;
    return GGML_EXIT_SUCCESS;
}

struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
...
@@ -16287,8 +16617,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    {
                        n_tasks = n_threads;

-                       GGML_ASSERT(node->src[1]->ne[3] == 1);

                        const int64_t ne00 = node->src[0]->ne[0]; // W
                        const int64_t ne01 = node->src[0]->ne[1]; // H
                        const int64_t ne02 = node->src[0]->ne[2]; // C
...
@@ -16298,19 +16626,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                        const int64_t ne11 = node->src[1]->ne[1]; // H
                        const int64_t ne12 = node->src[1]->ne[2]; // C

                        const int64_t ne0 = node->ne[0];
                        const int64_t ne1 = node->ne[1];
                        const int64_t ne2 = node->ne[2];
                        const int64_t nk  = ne00*ne01;
                        const int64_t ew0 = nk * ne02;

-                       UNUSED(ne02);
                        UNUSED(ne03);
-                       UNUSED(nk);
                        UNUSED(ne2);

                        size_t cur = 0;

                        if (node->src[0]->type == GGML_TYPE_F16 &&
                            node->src[1]->type == GGML_TYPE_F32) {
-                           cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
                            cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                        } else if (node->src[0]->type == GGML_TYPE_F32 &&
                                   node->src[1]->type == GGML_TYPE_F32) {
                            cur = sizeof(float)* (ne10*ne11*ne12);
                        } else {
                            GGML_ASSERT(false);
...
@@ -16318,6 +16649,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                        work_size = MAX(work_size, cur);
                    } break;
                case GGML_OP_POOL_1D:
                case GGML_OP_POOL_2D:
                    {
                        n_tasks = 1;
                    } break;
                case GGML_OP_FLASH_ATTN:
                    {
                        n_tasks = n_threads;
...
@@ -16427,7 +16763,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
    return cplan;
}

-void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    {
        GGML_ASSERT(cplan);
        GGML_ASSERT(cplan->n_threads > 0);
...
@@ -16453,6 +16789,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
        /*.n_threads           =*/ n_threads,
        /*.n_active            =*/ n_threads,
        /*.node_n              =*/ -1,
        /*.abort_callback      =*/ NULL,
        /*.abort_callback_data =*/ NULL,
    };

    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
...
@@ -16476,12 +16814,12 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
    const int64_t perf_start_time_us = ggml_perf_time_us();

    // this is a work thread too
-   ggml_graph_compute_thread(&workers[0]);
    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);

    // don't leave affinity set on the main thread
    clear_numa_thread_affinity();

-   // join thread pool
    // join or kill thread pool
    if (n_threads > 1) {
        for (int j = 1; j < n_threads; j++) {
            const int rc = ggml_thread_join(workers[j].thrd, NULL);
...
@@ -16505,6 +16843,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
            (double) perf_time_us_cur     / 1000.0,
            (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
    }

    return compute_status;
}

void ggml_graph_reset(struct ggml_cgraph * cgraph) {
...
@@ -16578,9 +16918,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
}

void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-   //assert(cgraph->work == NULL);
-   //assert(cgraph->work_size == 0);

    uint64_t size_eval = 0;

    // compute size of intermediate results
...
@@ -17019,9 +17356,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");

-   GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-   GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);

    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
...
llama/ggml.h View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -227,8 +227,13 @@
#define GGML_MAX_NAME          48
#define GGML_DEFAULT_N_THREADS 4

#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1

#define GGML_UNUSED(x) (void)(x)

#define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
...
@@ -389,6 +394,8 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
...
@@ -468,6 +475,10 @@ extern "C" {
        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];

        // abort ggml_graph_compute when true
        bool (*abort_callback)(void * data);
        void * abort_callback_data;
    };

    // computation graph
...
@@ -1136,6 +1147,17 @@ extern "C" {
            int                   mode,
            int                   n_ctx);

    // custom RoPE, in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
            int                   mode,
            float                 freq_base,
            float                 freq_scale,
            int                   n_ctx);

    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
...
@@ -1190,6 +1212,31 @@ extern "C" {
            int                   s,
            int                   d);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,
        GGML_OP_POOL_COUNT,
    };

    GGML_API struct ggml_tensor * ggml_pool_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
            int                   k0, // kernel size
            int                   s0, // stride
            int                   p0); // padding

    GGML_API struct ggml_tensor * ggml_pool_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
            int                   k0,
            int                   k1,
            int                   s0,
            int                   s1,
            int                   p0,
            int                   p1);

    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1329,7 +1376,7 @@ extern "C" {
...
@@ -1329,7 +1376,7 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API
struct
ggml_cplan
ggml_graph_plan
(
struct
ggml_cgraph
*
cgraph
,
int
n_threads
/*= GGML_DEFAULT_N_THREADS*/
);
GGML_API
struct
ggml_cplan
ggml_graph_plan
(
struct
ggml_cgraph
*
cgraph
,
int
n_threads
/*= GGML_DEFAULT_N_THREADS*/
);
GGML_API
void
ggml_graph_compute
(
struct
ggml_cgraph
*
cgraph
,
struct
ggml_cplan
*
cplan
);
GGML_API
int
ggml_graph_compute
(
struct
ggml_cgraph
*
cgraph
,
struct
ggml_cplan
*
cplan
);
GGML_API
void
ggml_graph_reset
(
struct
ggml_cgraph
*
cgraph
);
GGML_API
void
ggml_graph_reset
(
struct
ggml_cgraph
*
cgraph
);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// same as ggml_graph_compute() but the work data is allocated as a part of the context
...
...
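Taken together, the new cplan fields and the int return of ggml_graph_compute give callers a cooperative way to cancel a running graph. A sketch of how a caller might wire this up, assuming an already-built ggml_cgraph named graph; the callback and flag are illustrative, not part of the commit:

    // abort graph computation when an external flag is set (stdbool.h/stdlib.h assumed)
    static bool my_should_abort(void * data) {
        return *(bool *) data; // set from another thread or a signal handler
    }

    bool stop_requested = false;

    struct ggml_cplan cplan = ggml_graph_plan(&graph, /*n_threads=*/8);
    uint8_t * work = malloc(cplan.work_size);   // caller owns work_data when work_size > 0
    cplan.work_data           = work;
    cplan.abort_callback      = my_should_abort;
    cplan.abort_callback_data = &stop_requested;

    if (ggml_graph_compute(&graph, &cplan) == GGML_EXIT_ABORTED) {
        // computation stopped early; results are partial
    }
    free(work);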
llama/k_quants.c View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
llama/k_quants.h View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -41,6 +41,14 @@
#define K_SCALE_SIZE 12
#endif

#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

//
// Super-block quantization structures
//
...
llama/llama-util.h View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -201,13 +201,13 @@ struct llama_mmap {
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
-       int flags = MAP_PRIVATE;
        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
#ifdef __linux__
        if (prefetch) { flags |= MAP_POPULATE; }
#endif
-       addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }
...
@@ -249,7 +249,7 @@ struct llama_mmap {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

-       addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);

        error = GetLastError();
        CloseHandle(hMapping);
...
llama/llama.cpp View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
//
// memory sizes
//

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
{
    static std::map<e_model, size_t> k_sizes = {
-       { MODEL_3B,    256ull * MB },
-       { MODEL_7B,    512ull * MB },
-       { MODEL_13B,   512ull * MB },
-       { MODEL_30B,   512ull * MB },
-       { MODEL_65B,  1024ull * MB },
        /* empirical scaling, still a guess */
        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
    };
    return k_sizes;
}
...
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
{
    static std::map<e_model, size_t> k_sizes = {
-       { MODEL_3B,   512ull * MB },
-       { MODEL_7B,   768ull * MB },
-       { MODEL_13B, 1024ull * MB },
-       { MODEL_30B, 1280ull * MB },
-       { MODEL_65B, 1536ull * MB },
        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
    };
    return k_sizes;
}
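The scratch and eval buffers now grow with the context length instead of being flat per model size. As a worked example of the formulas above (MB here stands for the same 1024*1024 constant llama.cpp defines):

    // 7B sizing at two context lengths, reproducing the tables above
    const size_t MB = 1024*1024;
    size_t scratch0_7b_2k = (2048 / 16  + 256) * MB; // 384 MB
    size_t eval_7b_2k     = (2048 / 256 + 768) * MB; // 776 MB
    size_t scratch0_7b_8k = (8192 / 16  + 256) * MB; // 768 MB
    size_t eval_7b_8k     = (8192 / 256 + 768) * MB; // 800 MB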
...
@@ -215,6 +216,10 @@ struct llama_hparams {
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;

    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

    bool operator!=(const llama_hparams & other) const {
...
@@ -329,7 +334,7 @@ struct llama_model {
};

struct llama_context {
-   llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
    ~llama_context() {
        if (ctx_metal) {
...
@@ -350,7 +355,6 @@ struct llama_context {
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    const llama_model & model;
-   const llama_vocab & vocab;

    bool model_owner = false;
...
@@ -577,7 +581,9 @@ struct llama_file_loader {
        }

        // skip to the next multiple of 32 bytes
-       file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        }

        tensor.file_off = file.tell();
        tensor.name = name;
...
@@ -674,7 +680,7 @@ struct llama_model_loader {
        *ctx_size_p = *mmapped_size_p = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-           *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
        }
    }
...
@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
        /*.rope_freq_base              =*/ 10000.0f,
        /*.rope_freq_scale             =*/ 1.0f,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
...
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
}

int llama_max_devices() {
    return LLAMA_MAX_DEVICES;
}

bool llama_mmap_supported() {
    return llama_mmap::SUPPORTED;
}
...
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
...
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
        }

        hparams.n_ctx = n_ctx;

        hparams.rope_freq_base  = rope_freq_base;
        hparams.rope_freq_scale = rope_freq_scale;
    }

    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

    {
        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
        fprintf(stderr, "%s: freq_scale = %g\n",  __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
    }

    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
...
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
         // this is the memory required by one llama_state
         const size_t mem_required_state =
...
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
...
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
...
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
...
@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
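Conceptually, the two extra arguments feed the rotary position embedding (RoPE) angles: in the custom variant the rotation angle for position p and dimension pair i is roughly theta(p, i) = freq_scale * p * freq_base^(-2i / n_rot), so freq_base = 10000 and freq_scale = 1 reproduce the behaviour of the plain ggml_rope_inplace call that was replaced. A small sketch of that angle formula under those assumptions (not the ggml kernel itself):

#include <cmath>
#include <cstdio>

// Sketch of the per-pair RoPE angle with the new parameters:
// theta(p, i) = freq_scale * p * freq_base^(-2i / n_rot)
static float rope_theta(int p, int i, int n_rot, float freq_base, float freq_scale) {
    return freq_scale * p * std::pow(freq_base, -2.0f * i / n_rot);
}

int main() {
    const int n_rot = 128; // n_embd / n_head for a typical LLaMA layer
    // Defaults keep the original RoPE; freq_scale < 1 stretches positions for longer contexts.
    std::printf("theta = %f\n", rope_theta(/*p=*/ 42, /*i=*/ 3, n_rot, 10000.0f, 1.0f));
    return 0;
}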
...
@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
     float cum_sum = 0.0f;
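The reworked normalization above also guards against a (near-)zero sum of second derivatives: with a flat candidate distribution the old code divided by roughly zero and produced non-finite weights, while the new branch falls back to uniform weights of 1/size. A small standalone sketch of just that step, mirroring the added code:

#include <cstdio>
#include <numeric>
#include <vector>

// Mirrors the guarded normalization added to llama_sample_tail_free: divide by the
// sum when it is meaningfully non-zero, otherwise fall back to a uniform distribution.
static void normalize_or_uniform(std::vector<float> & second_derivatives) {
    const float sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
    if (sum > 1e-6f) {
        for (float & value : second_derivatives) {
            value /= sum;
        }
    } else {
        for (float & value : second_derivatives) {
            value = 1.0f / second_derivatives.size();
        }
    }
}

int main() {
    std::vector<float> flat = {0.0f, 0.0f, 0.0f, 0.0f};
    normalize_or_uniform(flat);   // the old code would have divided by zero here
    std::printf("%f\n", flat[0]); // prints 0.250000
    return 0;
}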
...
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
           float scale,
           float smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
...
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
...
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
...
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
...
@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
-int llama_tokenize(
+int llama_tokenize_with_model(
-        struct llama_context * ctx,
+        const struct llama_model * model,
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
...
@@ -3581,8 +3615,29 @@ int llama_tokenize(
     return res.size();
 }
+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 int llama_n_ctx(const struct llama_context * ctx) {
...
@@ -3593,19 +3648,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
-int llama_get_vocab(
+int llama_get_vocab_from_model(
-        const struct llama_context * ctx,
+        const struct llama_model * model,
         const char * * strings,
         float * scores,
         int capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i < n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
+        scores[i]  = model->vocab.id_to_token[i].score;
     }
     return n;
 }
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
...
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 llama_token llama_token_bos() {
...

llama/llama.h
View file @ a83eaa7a

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -115,6 +115,11 @@ extern "C" {
         int32_t n_gpu_layers;                  // number of layers to store in VRAM
         int32_t main_gpu;                      // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
...
@@ -174,6 +179,8 @@ extern "C" {
         int32_t n_eval;
     };
+    LLAMA_API int llama_max_devices();
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
...
@@ -296,10 +303,21 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
+    LLAMA_API int llama_tokenize_with_model(
+            const struct llama_model * model,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
...
@@ -308,6 +326,12 @@ extern "C" {
             float * scores,
             int capacity);
+    LLAMA_API int llama_get_vocab_from_model(
+            const struct llama_model * model,
+            const char * * strings,
+            float * scores,
+            int capacity);
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
...
@@ -320,7 +344,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
     // Token Id -> String. Uses the vocabulary in the provided context
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token);
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
...
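To illustrate the model-level entry points declared above, a hedged usage sketch that tokenizes a prompt and prints token strings without creating a llama_context (the model path and prompt are placeholders; llama_free_model is assumed to be available from the same header):

#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        return 1;
    }

    // Tokenize against the model's vocabulary directly - no context required.
    std::vector<llama_token> tokens(llama_n_ctx_from_model(model));
    const int n = llama_tokenize_with_model(model, "Hello world", tokens.data(),
                                            (int) tokens.size(), /*add_bos=*/ true);
    for (int i = 0; i < n; ++i) {
        std::printf("%d -> %s\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
    }

    llama_free_model(model);
    return 0;
}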