From 27ad57a69b85bf12420a27e9945e580cc280be57 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Fri, 14 Jul 2023 12:46:21 +0300
Subject: [PATCH 01/22] Metal: faster Q4_0 and Q4_1 matrix x vector kernels
 (#2212)

* 3-5% faster Q4_0 on Metal

* 7-25% faster Q4_1 on Metal

* Oops, forgot to delete the original Q4_1 kernel

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml-metal.m     |   8 +--
 ggml-metal.metal | 182 ++++++++++++++++++++++++++++-------------------
 2 files changed, 109 insertions(+), 81 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 02dc9beb9..c795ee227 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -739,12 +739,8 @@ void ggml_metal_graph_compute(
                                 [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
                                 [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
 
-                                if (src0t == GGML_TYPE_Q4_0) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q4_1) {
-                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                 }
                                 else if (src0t == GGML_TYPE_Q2_K ||
                                          src0t == GGML_TYPE_Q3_K ||
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 30d60fa58..f094a1d40 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -395,9 +395,12 @@ kernel void kernel_mul_mat_q4_0_f32(
     // each thread in a SIMD group deals with 1 block.
     for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
 
+        float sumy = 0;
         for (int i = 0; i < QK4_0 / 4; i++) {
             y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
         }
+        sumy *= (-8.f);
 
         for (int row = 0; row < N_DST; row++) {
             // prefetch next x block
@@ -405,39 +408,50 @@ kernel void kernel_mul_mat_q4_0_f32(
 
             // calculate
             float d = qb_curr.d;
-            float2 acc = {0.0f, 0.0f};
+            float acc = sumy;
             for (int i = 0; i < 16; i++) {
-                acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
-                acc[1] += yl[i] + yl[i+16];
+                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
             }
-            sumf[row] += d * (acc[0] - 8.f*acc[1]);
+            sumf[row] += d * acc;
             qb_curr = qb_next;
         }
     }
 
-    for (int i = 0; i < QK4_0 / 4; i++) {
-        y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
-    }
-
-    for (int row = 0; row < N_DST; row++) {
-        // prefetch next x block
-        qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
-
-        // calculate
-        float d = qb_curr.d;
-        float2 acc = {0.0f, 0.0f};
-        for (int i = 0; i < 16; i++) {
-            acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
-            acc[1] += yl[i] + yl[i+16];
+    if (nb % N_SIMDWIDTH == 0) {
+        for (int row = 0; row < N_DST; ++row) {
+            all_sum = simd_sum(sumf[row]);
+            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+            }
         }
-        if (tiisg < nb % N_SIMDWIDTH) {
-            sumf[row] += d * (acc[0] - 8.f*acc[1]);
-        }
-        qb_curr = qb_next;
+    } else {
 
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
-            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+        sumy *= (-8.f);
+
+        for (int row = 0; row < N_DST; row++) {
+            // prefetch next x block
+            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
+
+            // calculate
+            float d = qb_curr.d;
+            float acc = sumy;
+            for (int i = 0; i < 16; i++) {
+                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+            }
+            if (tiisg < nb % N_SIMDWIDTH) {
+                sumf[row] += d * acc;
+            }
+            qb_curr = qb_next;
+
+            all_sum = simd_sum(sumf[row]);
+            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+            }
         }
     }
 }
@@ -449,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
+        constant   int64_t & ne01[[buffer(4)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_1;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    const int nb = ne00/QK4_0;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
     device const float      * y = (device const float      *) src1 + r1*ne10;
+    block_q4_1 qb_curr, qb_next;
+    float4 y_curr[8];       // src1 vector cache
+    float sumf[N_DST]={0.f}, all_sum;
+    thread float * yl=(thread float *)y_curr;
 
-    const uint nth = tptg.x*tptg.y;
-    const uint ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-        const float m = (float)x[i].m;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float   * yl = y + i * QK4_1 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-
-            acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
-            acc[1] += yl[j+16] * (d * (xl[j] >>  4) + m);
+    // bootstrap
+    qb_curr = x[tiisg];
+    // each thread in a SIMD group deals with 1 block.
+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
 
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
         }
 
-        sumf += acc[0] + acc[1];
+        for (int row = 0; row < N_DST; row++) {
+            // prefetch next x block
+            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
+
+            // calculate
+            const float d = qb_curr.d;
+            const float m = qb_curr.m;
+            float acc = 0.f;
+            for (int i = 0; i < 16; i++) {
+                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+            }
+            sumf[row] += d * acc + m * sumy;
+            qb_curr = qb_next;
+        }
     }
 
-    sum[ith] = sumf;
+    if (nb % N_SIMDWIDTH == 0) {
+        for (int row = 0; row < N_DST; ++row) {
+            all_sum = simd_sum(sumf[row]);
+            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+            }
+        }
+    } else {
 
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            // prefetch next x block
+            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
+
+            // calculate
+            const float d = qb_curr.d;
+            const float m = qb_curr.m;
+            float acc = 0.f;
+            for (int i = 0; i < 16; i++) {
+                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
+            }
+            if (tiisg < nb % N_SIMDWIDTH) {
+                sumf[row] += d * acc + m * sumy;
+            }
+            qb_curr = qb_next;
+
+            all_sum = simd_sum(sumf[row]);
+            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+            }
+        }
     }
 }
 

From 697966680b27d9b4f05668605b863cb9aea3e15f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 14 Jul 2023 16:36:41 +0300
Subject: [PATCH 02/22] ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM
 rope)

---
 ggml-cuda.cu | 54 ++++++++++++++++++++++++++--
 ggml.c       | 99 ++++++++++++++++++++++++++--------------------------
 2 files changed, 101 insertions(+), 52 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index e0d5e9156..920466aae 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1667,6 +1667,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -2064,6 +2098,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2618,13 +2660,21 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
-    GGML_ASSERT(mode == 0);
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
     const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
 
+    bool is_glm = mode & 4;
+
     // compute
-    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }
 
     (void) dst;
     (void) src0_ddq_i;
diff --git a/ggml.c b/ggml.c
index c137ae658..f5821f1f1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -10684,6 +10684,8 @@ static void ggml_compute_forward_mul_mat(
 
     const enum ggml_type type = src0->type;
 
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10749,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata = params->wdata;
+                            float * const wdata    = params->wdata;
                     ggml_to_float_t const to_float = type_traits[type].to_float;
 
                     size_t id = 0;
@@ -10805,7 +10807,7 @@ static void ggml_compute_forward_mul_mat(
     // src1 rows
     const int64_t nr1 = ne11*ne12*ne13;
 
-    void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
     for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10830,15 @@ static void ggml_compute_forward_mul_mat(
         const int64_t i3 = i13;
 
         const char * src0_row = (const char *) src0->data + (  0 + i02*nb02 + i03*nb03     );
-        const char * src1_col = (const char *)      wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
+
+        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+        //       the original src1 data pointer, so we should index using the indices directly
+        // TODO: this is a bit of a hack, we should probably have a better way to handle this
+        const char * src1_col = (const char *) wdata +
+            (src1_cont || src1->type != vec_dot_type
+             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+             : (i11*nb11 + i12*nb12 + i13*nb13));
 
         float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
@@ -12982,12 +12992,13 @@ static void ggml_compute_forward_conv_1d(
     };
 }
 
-// ggml_compute_forward_conv_2d_sk_p0
+// ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+static void ggml_compute_forward_conv_2d_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
               struct ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13018,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     // size of the convolution row - the kernel size unrolled across all channels
     const int ew0 = nk0*nk1*ne02;
 
+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
         memset(params->wdata, 0, params->wsize);
 
         // prepare source data (src1)
         {
             ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-            for (int i13 = 0; i13 < ne13; i13++) {
-                for (int i12 = 0; i12 < ne12; i12++) {
-                    const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
-                    ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
 
-                    for (int i1 = 0; i1 < ne1; i1++) {
-                        for (int i0 = 0; i0 < ne0; i0++) {
-                            for (int ik1 = 0; ik1 < nk1; ik1++) {
-                                for (int ik0 = 0; ik0 < nk0; ik0++) {
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                const int idx0 = i0*s0 + ik0*d0 - p0;
+                                const int idx1 = i1*s1 + ik1*d1 - p1;
+
+                                if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
                                     dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                                        GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                                        GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                                 }
                             }
                         }
@@ -13071,19 +13091,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     }
 }
 
-static void ggml_compute_forward_conv_2d_sk_p0(
+static void ggml_compute_forward_conv_2d(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst
+        ) {
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
+                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
             } break;
         case GGML_TYPE_F32:
             {
-                //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
                 GGML_ASSERT(false);
             } break;
         default:
@@ -13093,32 +13115,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
     }
 }
 
-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
-    const struct ggml_compute_params* params,
-    const struct ggml_tensor* src0,
-    const struct ggml_tensor* src1,
-    const struct ggml_tensor* opt0,
-    struct ggml_tensor* dst) {
-    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
-    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
-    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
-    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
-    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
-    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-    GGML_ASSERT(d0 == 1); // dilation not supported
-    GGML_ASSERT(d1 == 1);
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(p1 == 0);
-
-    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
-        ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-    } else {
-        GGML_ASSERT(false); // only stride equal to kernel size is supported
-    }
-}
-
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
@@ -16575,19 +16571,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     const int64_t ne11 = node->src[1]->ne[1]; // H
                     const int64_t ne12 = node->src[1]->ne[2]; // C
 
+                    const int64_t ne0 = node->ne[0];
+                    const int64_t ne1 = node->ne[1];
+                    const int64_t ne2 = node->ne[2];
                     const int64_t nk = ne00*ne01;
+                    const int64_t ew0 = nk * ne02;
 
-                    UNUSED(ne02);
                     UNUSED(ne03);
-                    UNUSED(nk);
+                    UNUSED(ne2);
 
                     size_t cur = 0;
 
                     if (node->src[0]->type == GGML_TYPE_F16 &&
-                            node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                     } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                            node->src[1]->type == GGML_TYPE_F32) {
+                               node->src[1]->type == GGML_TYPE_F32) {
                         cur = sizeof(float)*      (ne10*ne11*ne12);
                     } else {
                         GGML_ASSERT(false);

From 229aab351c375899debad45fcb213bf0565bba4e Mon Sep 17 00:00:00 2001
From: James Reynolds <magnusviri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 11:34:40 -0600
Subject: [PATCH 03/22] make : fix combination of LLAMA_METAL and LLAMA_MPI
 (#2208)

Fixes https://github.com/ggerganov/llama.cpp/issues/2166 by moving commands after the CFLAGS are changed.
---
 Makefile | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index f887ed67e..0a4f0640d 100644
--- a/Makefile
+++ b/Makefile
@@ -151,9 +151,6 @@ ifdef LLAMA_MPI
 	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
 	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
 	OBJS     += ggml-mpi.o
-
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENBLAS
@@ -226,9 +223,6 @@ ifdef LLAMA_METAL
 	CXXFLAGS += -DGGML_USE_METAL
 	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	OBJS     += ggml-metal.o
-
-ggml-metal.o: ggml-metal.m ggml-metal.h
-	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_METAL
 
 ifneq ($(filter aarch64%,$(UNAME_M)),)
@@ -253,6 +247,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 
+ifdef LLAMA_METAL
+ggml-metal.o: ggml-metal.m ggml-metal.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_METAL
+
+ifdef LLAMA_MPI
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
 	$(CC) $(CFLAGS) -c $< -o $@

From 4304bd3cded73c867a882ea5ca4517e3995cc996 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 14 Jul 2023 19:44:08 +0200
Subject: [PATCH 04/22] CUDA: mul_mat_vec_q kernels for k-quants (#2203)

---
 ggml-cuda.cu | 357 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 328 insertions(+), 29 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 920466aae..4c9e21429 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
 
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half    d;              // delta
     half    m;              // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d;                 // delta
     uint8_t qh[4];          // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d;                 // delta
     half m;                 // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half    d;              // delta
     int8_t  qs[QK8_0];      // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half    d;              // delta
     half    s;              // unquantized sum
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif
 
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8];     // quants - high bit
     uint8_t qs[QK_K/4];        // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
 
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half    d[2];              // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
 
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d;                  // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
 
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2];   // quants, lower 4 bits
     uint8_t qh[QK_K/4];   // quants, upper 2 bits
@@ -1271,8 +1283,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }
 
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1293,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1353,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1406,7 +1423,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float    d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi,         ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >>  4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float    d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi,         ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float    d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi,         ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m);  // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1429,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
         const int iqs  = threadIdx.x % qi; // x block quant index when casting the quants to int
 
@@ -1962,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1971,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1980,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1989,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1998,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2006,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2494,13 +2769,22 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+    bool mul_mat_vec_q_implemented =
+        src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
 
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
@@ -2526,6 +2810,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;

From 206e01de117cc65ed8713ac9fdebc57ba4532ec3 Mon Sep 17 00:00:00 2001
From: Jiahao Li <liplus17@163.com>
Date: Sat, 15 Jul 2023 02:38:24 +0800
Subject: [PATCH 05/22] cuda : support broadcast add & mul (#2192)

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-cuda.cu | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 4c9e21429..73cfc55e5 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -252,13 +252,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -1996,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -2610,17 +2610,15 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i  != nullptr);
 
-    // TODO: support broadcasting
-    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
-    // const int64_t ne10 = src1->ne[0];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
 
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
@@ -2644,19 +2642,12 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(dst_ddf_i  != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
-    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01  = dst_ddf_i + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
     (void) dst;
     (void) src0_ddq_i;

From c48c525f8711780f3f7c59bf92f1760f38317218 Mon Sep 17 00:00:00 2001
From: Shangning Xu <32517059+xushangning@users.noreply.github.com>
Date: Sat, 15 Jul 2023 02:40:05 +0800
Subject: [PATCH 06/22] examples : fixed path typos in embd-input (#2214)

---
 examples/embd-input/README.md | 2 +-
 examples/embd-input/llava.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md
index 02d028f26..5c4c75ea7 100644
--- a/examples/embd-input/README.md
+++ b/examples/embd-input/README.md
@@ -17,7 +17,7 @@ make
 import torch
 
 bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd_input/llava_projection.pth"
+pth_path = "./examples/embd-input/llava_projection.pth"
 
 dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py
index 2f20cb722..bcbdd2bed 100644
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@@ -59,7 +59,7 @@ if __name__=="__main__":
     # Also here can use pytorch_model-00003-of-00003.bin directly.
     a.load_projection(os.path.join(
         os.path.dirname(__file__) ,
-        "llava_projetion.pth"))
+        "llava_projection.pth"))
     respose = a.chat_with_image(
         Image.open("./media/llama1-logo.png").convert('RGB'),
         "what is the text in the picture?")

From de8342423d9600cf6e15455c1a27bae441262b45 Mon Sep 17 00:00:00 2001
From: Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
Date: Fri, 14 Jul 2023 11:50:58 -0700
Subject: [PATCH 07/22] build.zig : install config header (#2216)

---
 build.zig | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/build.zig b/build.zig
index 49c159ebf..2287d2a2c 100644
--- a/build.zig
+++ b/build.zig
@@ -1,9 +1,19 @@
 const std = @import("std");
+const commit_hash = @embedFile(".git/refs/heads/master");
 
-// Zig Version: 0.11.0-dev.3379+629f0d23b
+// Zig Version: 0.11.0-dev.3986+e05c242cd
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
+
+    const config_header = b.addConfigHeader(
+        .{ .style = .blank, .include_path = "build-info.h" },
+        .{
+            .BUILD_NUMBER = 0,
+            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+        },
+    );
+
     const lib = b.addStaticLibrary(.{
         .name = "llama",
         .target = target,
@@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void {
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("./examples");
-    lib.addCSourceFiles(&.{
-        "ggml.c",
-    }, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{
-        "llama.cpp",
-    }, &.{"-std=c++11"});
+    lib.addConfigHeader(config_header);
+    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
+    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
     b.installArtifact(lib);
 
     const examples = .{
         "main",
         "baby-llama",
         "embedding",
-        // "metal",
+        "metal",
         "perplexity",
         "quantize",
         "quantize-stats",
         "save-load-state",
-        // "server",
+        "server",
         "simple",
         "train-text-from-scratch",
     };
@@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void {
         });
         exe.addIncludePath(".");
         exe.addIncludePath("./examples");
+        exe.addConfigHeader(config_header);
         exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
             "examples/common.cpp",
         }, &.{"-std=c++11"});
         exe.linkLibrary(lib);
         b.installArtifact(exe);
+
         const run_cmd = b.addRunArtifact(exe);
         run_cmd.step.dependOn(b.getInstallStep());
         if (b.args) |args| run_cmd.addArgs(args);
-        const run_step = b.step("run_" ++ example_name, "Run the app");
+
+        const run_step = b.step("run-" ++ example_name, "Run the app");
         run_step.dependOn(&run_cmd.step);
     }
 }

From 7513b7b0a1c11faa00ad5a34d22681e5f07d32e4 Mon Sep 17 00:00:00 2001
From: Bach Le <bach@bullno1.com>
Date: Sat, 15 Jul 2023 02:55:24 +0800
Subject: [PATCH 08/22] llama : add functions that work directly on model
 (#2197)

* Remove vocab reference from context

* Add functions that works directly with model
---
 llama.cpp | 64 +++++++++++++++++++++++++++++++++++++++++--------------
 llama.h   | 25 +++++++++++++++++++++-
 2 files changed, 72 insertions(+), 17 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 2d09d6ce7..b0cd9417c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -303,7 +303,7 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -324,7 +324,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     const llama_model & model;
-    const llama_vocab & vocab;
 
     bool model_owner = false;
 
@@ -2697,7 +2696,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -3535,13 +3534,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
 
-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+    const struct llama_model * model,
                   const char * text,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3554,29 @@ int llama_tokenize(
     return res.size();
 }
 
+int llama_tokenize(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3567,17 +3587,25 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab_from_model(
+        const struct llama_model * model,
+        const char * * strings,
+        float  * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 int llama_get_vocab(
         const struct llama_context * ctx,
         const char * * strings,
         float  * scores,
         int capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
-    }
-    return n;
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
 }
 
 float * llama_get_logits(struct llama_context * ctx) {
@@ -3588,12 +3616,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
 
-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 
 llama_token llama_token_bos() {
diff --git a/llama.h b/llama.h
index 4596b1ed4..e7c60f40c 100644
--- a/llama.h
+++ b/llama.h
@@ -270,10 +270,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +293,12 @@ extern "C" {
                                  float * scores,
                                    int   capacity);
 
+    LLAMA_API int llama_get_vocab_from_model(
+              const struct llama_model * model,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,7 +311,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+                           llama_token   token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+              const struct llama_model * model,
+                           llama_token   token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence

From e8035f141e1f71d739fa5cfc9c01531cdee6fc16 Mon Sep 17 00:00:00 2001
From: Evan Miller <emmiller@gmail.com>
Date: Fri, 14 Jul 2023 14:55:56 -0400
Subject: [PATCH 09/22] ggml : fix static_assert with older compilers #2024
 (#2218)

---
 ggml.c     | 10 ++++++----
 k_quants.h |  8 ++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index f5821f1f1..3ea8ba6ec 100644
--- a/ggml.c
+++ b/ggml.c
@@ -31,11 +31,17 @@
 #include <unistd.h>
 #endif
 
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif
 
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
 #endif
 #endif
 
-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
diff --git a/k_quants.h b/k_quants.h
index 6abe3d7b8..adc6a3913 100644
--- a/k_quants.h
+++ b/k_quants.h
@@ -15,6 +15,14 @@
 #define K_SCALE_SIZE 12
 #endif
 
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //

From 7cdd30bf1f84339c55a5e3de29384f6bbdebb61c Mon Sep 17 00:00:00 2001
From: Bach Le <bach@bullno1.com>
Date: Sat, 15 Jul 2023 03:00:58 +0800
Subject: [PATCH 10/22] cuda : allocate all temporary ggml_tensor_extra_gpu
 from a fixed-size buffer (#2220)

---
 ggml-cuda.cu | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 73cfc55e5..0646fa7b2 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3646,6 +3646,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
@@ -3663,8 +3679,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -3679,10 +3694,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
@@ -3695,6 +3712,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = data + g_scratch_offset;
 
         g_scratch_offset += size;
@@ -3704,6 +3722,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         void * data;
         CUDA_CHECK(cudaMalloc(&data, size));
         CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device] = data;
     }
 

From 7dabc66f3c63f8ea0f61bac346fa138e01df675f Mon Sep 17 00:00:00 2001
From: wzy <32936898+Freed-Wu@users.noreply.github.com>
Date: Sat, 15 Jul 2023 03:05:08 +0800
Subject: [PATCH 11/22] make : use pkg-config for OpenBLAS (#2222)

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 0a4f0640d..6c74e1346 100644
--- a/Makefile
+++ b/Makefile
@@ -154,8 +154,8 @@ ifdef LLAMA_MPI
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
-	LDFLAGS += -lopenblas
+	CFLAGS  += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
+	LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
 ifdef LLAMA_BLIS

From a6803cab946c817fb7aaf2a40b317f5d3e373bd1 Mon Sep 17 00:00:00 2001
From: Dave Della Costa <ddellacosta+github@gmail.com>
Date: Fri, 14 Jul 2023 15:13:38 -0400
Subject: [PATCH 12/22] flake : add runHook preInstall/postInstall to
 installPhase so hooks function (#2224)

---
 flake.nix | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/flake.nix b/flake.nix
index cebb47b94..dda7ee7b7 100644
--- a/flake.nix
+++ b/flake.nix
@@ -43,6 +43,8 @@
             "-DLLAMA_METAL=ON"
           ]);
           installPhase = ''
+            runHook preInstall
+
             mkdir -p $out/bin
             mv bin/* $out/bin/
             mv $out/bin/main $out/bin/llama
@@ -51,6 +53,8 @@
             echo "#!${llama-python}/bin/python" > $out/bin/convert.py
             cat ${./convert.py} >> $out/bin/convert.py
             chmod +x $out/bin/convert.py
+
+            runHook postInstall
           '';
           meta.mainProgram = "llama";
         };

From 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f Mon Sep 17 00:00:00 2001
From: Xiao-Yong Jin <jinxiaoyong@gmail.com>
Date: Sat, 15 Jul 2023 06:34:16 -0400
Subject: [PATCH 13/22] llama : add custom RoPE (#2054)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Implement customizable RoPE

The original RoPE has pre-defined parameters

theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]

Our customizable RoPE, ggml_rope_custom_inplace, uses

theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]

with the default matches the original

scale = 1.0
base = 10000

The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameter.

Recent researches show changing these two parameters extends the context limit with minimal loss.

1. Extending Context to 8K
   kaiokendev
   https://kaiokendev.github.io/til#extending-context-to-8k

2. Extending Context Window of Large Language Models via Positional Interpolation
   Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
   https://arxiv.org/abs/2306.15595

3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
   https://www.reddit.com/user/bloc97
   https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/

For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5

* ggml-metal: fix custom rope

* common: fix argument names in help

* llama: increase MEM_REQ_EVAL for MODEL_3B

It avoids crashing for quantized weights on CPU.
Better ways to calculate the required buffer size would be better.

* llama: make MEM_REQ_EVAL depend on n_ctx

* server: use proper Content-Type in curl examples

Without the header Content-Type: application/json, curl will POST with
Content-Type: application/x-www-form-urlencoded

Though our simple server doesn't care, the httplib.h used has a limit
with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192

With Content-Type: application/json, we can send large json data.

* style : minor fixes, mostly indentations

* ggml : fix asserts

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/common.cpp        | 16 ++++++++
 examples/common.h          |  2 +
 examples/main/main.cpp     | 12 +++++-
 examples/server/README.md  |  1 +
 examples/server/chat.sh    |  2 +
 examples/server/server.cpp | 18 ++++++++
 ggml-metal.m               | 45 +++++++++++---------
 ggml-metal.metal           |  6 ++-
 ggml.c                     | 50 +++++++++++++++++------
 ggml.h                     | 11 +++++
 llama.cpp                  | 84 +++++++++++++++++++++++---------------
 llama.h                    |  5 +++
 12 files changed, 185 insertions(+), 67 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 94875b054..8705127cb 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--rope-freq-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_base = std::stof(argv[i]);
+        } else if (arg == "--rope-freq-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@@ -493,6 +505,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
     fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -573,6 +587,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.use_mlock    = params.use_mlock;
     lparams.logits_all   = params.perplexity;
     lparams.embedding    = params.embedding;
+    lparams.rope_freq_base  = params.rope_freq_base;
+    lparams.rope_freq_scale = params.rope_freq_scale;
 
     return lparams;
 }
diff --git a/examples/common.h b/examples/common.h
index 6315df961..f52fef629 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -32,6 +32,8 @@ struct gpt_params {
     int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;   // if greater than 0, output the probabilities of top n_probs tokens.
+    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
+    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 2248c2458..bcbcf12b0 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
+    if (params.rope_freq_base != 10000.0) {
+        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 1.0) {
+        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    }
+
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
+                " you are on your own\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
diff --git a/examples/server/README.md b/examples/server/README.md
index ad9b6bb08..e5ca8269b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
 ```sh
 curl --request POST \
     --url http://localhost:8080/completion \
+    --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
 ```
 
diff --git a/examples/server/chat.sh b/examples/server/chat.sh
index a89f8e908..014360121 100644
--- a/examples/server/chat.sh
+++ b/examples/server/chat.sh
@@ -32,6 +32,7 @@ tokenize() {
         --silent \
         --request POST \
         --url "${API_URL}/tokenize" \
+        --header "Content-Type: application/json" \
         --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
     | jq '.tokens[]'
 }
@@ -64,6 +65,7 @@ chat_completion() {
         --no-buffer \
         --request POST \
         --url "${API_URL}/completion" \
+        --header "Content-Type: application/json" \
         --data-raw "${DATA}")
 
     printf "\n"
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 296c5d646..f442f2b56 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
@@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.n_ctx = std::stoi(argv[i]);
         }
+        else if (arg == "--rope-freq-base")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_base = std::stof(argv[i]);
+        }
+        else if (arg == "--rope-freq-scale")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = std::stof(argv[i]);
+        }
         else if (arg == "--memory-f32" || arg == "--memory_f32")
         {
             params.memory_f16 = false;
diff --git a/ggml-metal.m b/ggml-metal.m
index c795ee227..ee205bcdf 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -881,28 +881,35 @@ void ggml_metal_graph_compute(
 
                             const int n_past = ((int32_t *)(src1->data))[0];
 
+                            float freq_base;
+                            float freq_scale;
+                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                             [encoder setComputePipelineState:ctx->pipeline_rope];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
-                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:18];
+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                         } break;
diff --git a/ggml-metal.metal b/ggml-metal.metal
index f094a1d40..9f9a4fbd7 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -656,17 +656,19 @@ kernel void kernel_rope(
         constant       int & n_past,
         constant       int & n_dims,
         constant       int & mode,
+        constant     float & freq_base,
+        constant     float & freq_scale,
         uint3 tpig[[thread_position_in_grid]]) {
     const int64_t i3 = tpig[2];
     const int64_t i2 = tpig[1];
     const int64_t i1 = tpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = (float)p;
+    float theta = freq_scale * (float)p;
 
     if (!is_neox) {
         for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
diff --git a/ggml.c b/ggml.c
index 3ea8ba6ec..5ce1da0e9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6956,6 +6956,8 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_past,
         int                   n_dims,
         int                   mode,
+        float                 freq_base,
+        float                 freq_scale,
         int                   n_ctx,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
@@ -6969,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
 
     ggml_scratch_load(ctx);
 
@@ -6993,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -7003,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 freq_base,
+        float                 freq_scale,
+        int                   n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
 }
 
 // ggml_rope_back
@@ -12074,16 +12090,21 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
     assert(n_past >= 0);
 
@@ -12112,7 +12133,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
@@ -12124,7 +12145,7 @@ static void ggml_compute_forward_rope_f32(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = freq_scale * (float)p;
 
                 if (is_glm) {
                     theta = MIN(p, n_ctx - 2);
@@ -12201,16 +12222,21 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
     assert(n_past >= 0);
 
@@ -12239,7 +12265,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
@@ -12251,7 +12277,7 @@ static void ggml_compute_forward_rope_f16(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = freq_scale * (float)p;
 
                 if (is_glm) {
                     theta = MIN(p, n_ctx - 2);
@@ -12312,7 +12338,7 @@ static void ggml_compute_forward_rope_f16(
                             const float x0 = GGML_FP16_TO_FP32(src[0]);
                             const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
 
-                            dst_data[0]     = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                             dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                         }
                     }
@@ -15710,7 +15736,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // necessary for llama
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 4);
+                    assert(ggml_nelements(src1) == 6);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode   = ((int32_t *) src1->data)[2];
@@ -15731,7 +15757,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 4);
+                    assert(ggml_nelements(src1) == 3);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode   = ((int32_t *) src1->data)[2];
diff --git a/ggml.h b/ggml.h
index b88c35bae..24856a255 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1121,6 +1121,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 freq_base,
+            float                 freq_scale,
+            int                   n_ctx);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
diff --git a/llama.cpp b/llama.cpp
index b0cd9417c..27e1ee964 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
@@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   768ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
@@ -189,6 +190,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -647,7 +652,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
 
@@ -843,6 +848,8 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.rope_freq_base              =*/ 10000.0f,
+        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
@@ -966,6 +973,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1000,22 +1009,27 @@ static void llama_model_load_internal(
         }
 
         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd     = %u\n",   __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult     = %u\n",   __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
+        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1164,9 +1178,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at    (model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1270,6 +1284,8 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1278,7 +1294,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1330,6 +1346,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
@@ -1427,11 +1446,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
@@ -2674,8 +2693,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2750,9 +2770,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
diff --git a/llama.h b/llama.h
index e7c60f40c..e744584f2 100644
--- a/llama.h
+++ b/llama.h
@@ -89,6 +89,11 @@ extern "C" {
         int32_t  n_gpu_layers;                 // number of layers to store in VRAM
         int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float    rope_freq_base;  // RoPE base frequency
+        float    rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback

From 27ab66e437797aedbb23b3599385756b6c26ac39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Podiv=C3=ADn?=
 <66251151+jpodivin@users.noreply.github.com>
Date: Sun, 16 Jul 2023 21:54:47 +0200
Subject: [PATCH 14/22] py : turn verify-checksum-models.py into executable
 (#2245)

README.md was adjusted to reflect the change.

Signed-off-by: Jiri Podivin <jpodivin@gmail.com>
---
 README.md                         | 2 +-
 scripts/verify-checksum-models.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 scripts/verify-checksum-models.py

diff --git a/README.md b/README.md
index 476cc438b..f45e4bf08 100644
--- a/README.md
+++ b/README.md
@@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
 
 ```bash
 # run the verification script
-python3 .\scripts\verify-checksum-models.py
+./scripts/verify-checksum-models.py
 ```
 
 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py
old mode 100644
new mode 100755
index d12748281..307b7c08d
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -1,3 +1,5 @@
+#!/bin/env python3
+
 import os
 import hashlib
 

From 672dda10e4d8ac79df5d5970da7fb69d242ca9a7 Mon Sep 17 00:00:00 2001
From: Qingyou Meng <meng.qingyou@gmail.com>
Date: Mon, 17 Jul 2023 03:57:28 +0800
Subject: [PATCH 15/22] ggml : fixed runtime bugs and compile errors related to
 GGML_PERF and GGML_DEBUG (#2219)

* fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG

* remove ifdef GGML_PERF; update fmt
---
 ggml.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/ggml.c b/ggml.c
index 5ce1da0e9..c56a3d0e0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4412,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;
 
-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -16317,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
 
             // distribute new work or execute it direct if 1T
@@ -16348,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                     if (GGML_OP_HAS_FINALIZE[node->op]) {
                         params.type = GGML_TASK_FINALIZE;
                         ggml_compute_forward(&params, node);
-                        ggml_graph_compute_perf_stats_node(node, state->shared);
                     }
+
+                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 } else {
                     break;
                 }
@@ -16891,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }
 
 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work      == NULL);
-    //assert(cgraph->work_size == 0);
-
     uint64_t size_eval = 0;
 
     // compute size of intermediate results
@@ -17332,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
     GGML_PRINT("=== GRAPH ===\n");
 
-    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];

From b7647436ccc80970b44a270f70f4f2ea139054d1 Mon Sep 17 00:00:00 2001
From: Alex Klinkhamer <git@grencez.dev>
Date: Sun, 16 Jul 2023 14:01:45 -0700
Subject: [PATCH 16/22] llama : fix t_start_sample_us initialization warning
 (#2238)

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 27e1ee964..0f9d5346d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2205,7 +2205,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
                          float   scale,
                          float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
 
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);

From 7568d1a2b206331412106ea66da3f871025e0c3c Mon Sep 17 00:00:00 2001
From: Jiahao Li <liplus17@163.com>
Date: Tue, 18 Jul 2023 01:39:29 +0800
Subject: [PATCH 17/22] Support dup & cont ops on CUDA (#2242)

---
 ggml-cuda.cu | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 0646fa7b2..d3054a7fa 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3537,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3670,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3776,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3830,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:

From 6cbf9dfb32f0e23ed3afd02d30ab066ed53e2c4d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 18 Jul 2023 11:50:49 +0300
Subject: [PATCH 18/22] llama : shorten quantization descriptions

---
 examples/quantize/quantize.cpp | 114 ++++++---------------------------
 1 file changed, 19 insertions(+), 95 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 797d2f0c5..744f549c5 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -14,103 +14,27 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    {
-        "Q4_0",
-        LLAMA_FTYPE_MOSTLY_Q4_0,
-        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
-    },
-    {
-        "Q4_1",
-        LLAMA_FTYPE_MOSTLY_Q4_1,
-        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
-    },
-    {
-        "Q5_0",
-        LLAMA_FTYPE_MOSTLY_Q5_0,
-        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
-    },
-    {
-        "Q5_1",
-        LLAMA_FTYPE_MOSTLY_Q5_1,
-        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
-    },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
 #ifdef GGML_USE_K_QUANTS
-    {
-        "Q2_K",
-        LLAMA_FTYPE_MOSTLY_Q2_K,
-        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
-    },
-    {
-        "Q3_K",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "alias for Q3_K_M"
-    },
-    {
-        "Q3_K_S",
-        LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_M",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_L",
-        LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
-    },
-    {
-        "Q4_K",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "alias for Q4_K_M",
-    },
-    {
-        "Q4_K_S",
-        LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
-    },
-    {
-        "Q4_K_M",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
-    },
-    {
-        "Q5_K",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "alias for Q5_K_M",
-    },
-    {
-        "Q5_K_S",
-        LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
-    },
-    {
-        "Q5_K_M",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
-    },
-    {
-        "Q6_K",
-        LLAMA_FTYPE_MOSTLY_Q6_K,
-        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
-    },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
 #endif
-    {
-        "Q8_0",
-        LLAMA_FTYPE_MOSTLY_Q8_0,
-        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
-    },
-    {
-        "F16",
-        LLAMA_FTYPE_MOSTLY_F16,
-        "13.00G              @ 7B - extremely large, virtually no quality loss - not recommended",
-    },
-    {
-        "F32",
-        LLAMA_FTYPE_ALL_F32,
-        "26.00G              @ 7B - absolutely huge, lossless - not recommended",
-    },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
 };
 
 

From d01bccde9f759b24449fdaa16306b406a50eb367 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 18 Jul 2023 14:24:43 +0300
Subject: [PATCH 19/22] ci : integrate with ggml-org/ci (#2250)

* ci : run ctest

ggml-ci

* ci : add open llama 3B-v2 tests

ggml-ci

* ci : disable wget progress output

ggml-ci

* ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations

ggml-ci

* tests : try to fix tail free sampling test

ggml-ci

* ci : add K-quants

ggml-ci

* ci : add short perplexity tests

ggml-ci

* ci : add README.md

* ppl : add --chunks argument to limit max number of chunks

ggml-ci

* ci : update README
---
 .gitignore                         |   5 +-
 ci/README.md                       |  20 +++
 ci/run.sh                          | 262 +++++++++++++++++++++++++++++
 examples/common.cpp                |   7 +
 examples/common.h                  |   1 +
 examples/perplexity/perplexity.cpp |   6 +-
 llama.cpp                          |  15 +-
 tests/test-sampling.cpp            |   2 +
 8 files changed, 312 insertions(+), 6 deletions(-)
 create mode 100644 ci/README.md
 create mode 100644 ci/run.sh

diff --git a/.gitignore b/.gitignore
index faec869e0..a23ac5928 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,8 @@ build/
 build-em/
 build-debug/
 build-release/
+build-ci-debug/
+build-ci-release/
 build-static/
 build-cublas/
 build-opencl/
@@ -25,9 +27,10 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
+tmp/
 
 models/*
-*.bin
+models-mnt
 
 /main
 /quantize
diff --git a/ci/README.md b/ci/README.md
new file mode 100644
index 000000000..6c74c8138
--- /dev/null
+++ b/ci/README.md
@@ -0,0 +1,20 @@
+# CI
+
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+It is a good practice, before publishing changes to execute the full CI locally on your machine:
+
+```bash
+mkdir tmp
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
diff --git a/ci/run.sh b/ci/run.sh
new file mode 100644
index 000000000..c823bc467
--- /dev/null
+++ b/ci/run.sh
@@ -0,0 +1,262 @@
+#/bin/bash
+
+if [ -z "$2" ]; then
+    echo "usage: $0 <output-dir> <mnt-dir>"
+    exit 1
+fi
+
+mkdir -p "$1"
+mkdir -p "$2"
+
+OUT=$(realpath "$1")
+MNT=$(realpath "$2")
+
+rm -v $OUT/*.log
+rm -v $OUT/*.exit
+rm -v $OUT/*.md
+
+sd=`dirname $0`
+cd $sd/../
+SRC=`pwd`
+
+## helpers
+
+# download a file if it does not exist or if it is outdated
+function gg_wget {
+    local out=$1
+    local url=$2
+
+    local cwd=`pwd`
+
+    mkdir -p $out
+    cd $out
+
+    # should not re-download if file is the same
+    wget -nv -N $url
+
+    cd $cwd
+}
+
+function gg_printf {
+    printf -- "$@" >> $OUT/README.md
+}
+
+function gg_run {
+    ci=$1
+
+    set -o pipefail
+    set -x
+
+    gg_run_$ci | tee $OUT/$ci.log
+    cur=$?
+    echo "$cur" > $OUT/$ci.exit
+
+    set +x
+    set +o pipefail
+
+    gg_sum_$ci
+
+    ret=$((ret | cur))
+}
+
+## ci
+
+# ctest_debug
+
+function gg_run_ctest_debug {
+    cd ${SRC}
+
+    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+    set +e
+}
+
+function gg_sum_ctest_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# ctest_release
+
+function gg_run_ctest_release {
+    cd ${SRC}
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    if [ -z $GG_BUILD_LOW_PERF ]; then
+        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    else
+        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    fi
+
+    set +e
+}
+
+function gg_sum_ctest_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
+# open_llama_3b_v2
+
+function gg_run_open_llama_3b_v2 {
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.bin"
+    model_q8_0="${path_models}/ggml-model-q8_0.bin"
+    model_q4_0="${path_models}/ggml-model-q4_0.bin"
+    model_q4_1="${path_models}/ggml-model-q4_1.bin"
+    model_q5_0="${path_models}/ggml-model-q5_0.bin"
+    model_q5_1="${path_models}/ggml-model-q5_1.bin"
+    model_q3_k="${path_models}/ggml-model-q3_k.bin"
+    model_q4_k="${path_models}/ggml-model-q4_k.bin"
+    model_q5_k="${path_models}/ggml-model-q5_k.bin"
+    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 3B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+}
+
+## main
+
+if [ -z $GG_BUILD_LOW_PERF ]; then
+    rm -rf ${SRC}/models-mnt
+
+    mnt_models=$(realpath ${MNT}/models)
+    mkdir -p ${mnt_models}
+    ln -sfn ${mnt_models} ${SRC}/models-mnt
+
+    python3 -m pip install -r ${SRC}/requirements.txt
+fi
+
+ret=0
+
+#test $ret -eq 0 && gg_run ctest_debug
+#test $ret -eq 0 && gg_run ctest_release
+
+if [ -z $GG_BUILD_LOW_PERF ]; then
+    test $ret -eq 0 && gg_run open_llama_3b_v2
+fi
+
+exit $ret
diff --git a/examples/common.cpp b/examples/common.cpp
index 8705127cb..fd6dbc0e3 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--chunks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_chunks = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
diff --git a/examples/common.h b/examples/common.h
index f52fef629..037a4eecb 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -28,6 +28,7 @@ struct gpt_params {
     int32_t n_ctx                           = 512; // context size
     int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_chunks                        = -1;  // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
     int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 7e120ff12..bfad99939 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // BOS tokens will be added for each chunk before eval
     auto tokens = ::llama_tokenize(ctx, params.prompt, true);
 
-    int count   = 0;
+    const int n_chunk_max = tokens.size() / params.n_ctx;
 
-    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
     const int n_vocab = llama_n_vocab(ctx);
     const int n_batch = params.n_batch;
 
+    int count = 0;
     double nll = 0.0;
+
     fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     for (int i = 0; i < n_chunk; ++i) {
diff --git a/llama.cpp b/llama.cpp
index 0f9d5346d..fa3b7c03c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2024,9 +2024,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 64f9455d7..4437c3948 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -200,4 +200,6 @@ int main(void) {
     test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
 
     printf("OK\n");
+
+    return 0;
 }

From b1f429095328a34556c0e9a7a2fefced3db3368c Mon Sep 17 00:00:00 2001
From: wzy <32936898+Freed-Wu@users.noreply.github.com>
Date: Wed, 19 Jul 2023 15:01:11 +0800
Subject: [PATCH 20/22] cmake : install targets (#2256)

fix #2252
---
 CMakeLists.txt                                | 25 +++++++++++++++++++
 convert-lora-to-ggml.py                       |  1 +
 convert.py                                    |  1 +
 examples/baby-llama/CMakeLists.txt            |  1 +
 examples/benchmark/CMakeLists.txt             |  1 +
 examples/embd-input/CMakeLists.txt            |  2 ++
 examples/embedding/CMakeLists.txt             |  1 +
 examples/main/CMakeLists.txt                  |  1 +
 examples/metal/CMakeLists.txt                 |  1 +
 examples/perplexity/CMakeLists.txt            |  1 +
 examples/quantize-stats/CMakeLists.txt        |  1 +
 examples/quantize/CMakeLists.txt              |  1 +
 examples/save-load-state/CMakeLists.txt       |  1 +
 examples/server/CMakeLists.txt                |  1 +
 examples/simple/CMakeLists.txt                |  1 +
 .../train-text-from-scratch/CMakeLists.txt    |  1 +
 tests/CMakeLists.txt                          |  1 +
 17 files changed, 42 insertions(+)
 mode change 100644 => 100755 convert-lora-to-ggml.py
 mode change 100644 => 100755 convert.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9381dae1..abc96814d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -512,6 +512,7 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
     target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+    install(TARGETS ggml_shared LIBRARY)
 endif()
 
 add_library(llama
@@ -533,8 +534,32 @@ if (BUILD_SHARED_LIBS)
     if (LLAMA_METAL)
         set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
     endif()
+    install(TARGETS llama LIBRARY)
 endif()
 
+include(GNUInstallDirs)
+install(
+    FILES convert.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 #
 # programs, examples and tests
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
old mode 100644
new mode 100755
index f43c836f5..b4999ff5a
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import json
 import os
 import re
diff --git a/convert.py b/convert.py
old mode 100644
new mode 100755
index 7a2705e5c..e3f1096e1
--- a/convert.py
+++ b/convert.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 import concurrent.futures
 import copy
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
index d2ce36367..7b70227a5 100644
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 037696194..3f3415350 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt
index 2b623953e..5bbb1ea02 100644
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embdinput)
 add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
@@ -8,6 +9,7 @@ endif()
 
 set(TARGET embd-input-test)
 add_executable(${TARGET} embd-input-test.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index db73b6b44..0c752c7bb 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index c364242fb..cc1888948 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt
index a8c4284a5..f16d49165 100644
--- a/examples/metal/CMakeLists.txt
+++ b/examples/metal/CMakeLists.txt
@@ -1,3 +1,4 @@
 set(TEST_TARGET metal)
 add_executable(${TEST_TARGET} metal.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index 61b17b828..af00b4e16 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index 7bebc11a1..c5c394058 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 475fc8be8..47d0be72e 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index 08dbe5c2b..eadd13cdf 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 07ba76ad3..812a24b09 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -2,6 +2,7 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 1568f7364..0ac9cb03a 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET simple)
 add_executable(${TARGET} simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
index 1a44c4961..4459516d0 100644
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1acf050a7..11ec6c725 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,6 +1,7 @@
 function(llama_add_test source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
+    install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE llama)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()

From 45a1b07e9b20c33d71d8c849ff27d693a75a0269 Mon Sep 17 00:00:00 2001
From: wzy <32936898+Freed-Wu@users.noreply.github.com>
Date: Wed, 19 Jul 2023 15:01:55 +0800
Subject: [PATCH 21/22] flake : update flake.nix (#2270)

When `isx86_32 || isx86_64`, it will use mkl, else openblas

According to
https://discourse.nixos.org/t/rpath-of-binary-contains-a-forbidden-reference-to-build/12200/3,
add -DCMAKE_SKIP_BUILD_RPATH=ON

Fix #2261, Nix doesn't provide mkl-sdl.pc.
When we build with -DBUILD_SHARED_LIBS=ON, -DLLAMA_BLAS_VENDOR=Intel10_lp64
replace mkl-sdl.pc by mkl-dynamic-lp64-iomp.pc
---
 CMakeLists.txt | 11 ++++++++-
 README.md      |  2 +-
 flake.nix      | 63 ++++++++++++++++++++++++++++----------------------
 3 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index abc96814d..169332767 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,7 +186,16 @@ if (LLAMA_BLAS)
                 pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
             elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
                 # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+                pkg_check_modules(DepBLAS mkl-sdl)
+                if (NOT DepBLAS)
+                    if (BUILD_SHARED_LIBS)
+                        set(LINK_METHOD dynamic)
+                    else()
+                        set(LINK_METHOD static)
+                    endif()
+                    string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
+                    pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
+                endif()
             elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
                 # this doesn't provide pkg-config
                 # suggest to assign BLAS_INCLUDE_DIRS on your own
diff --git a/README.md b/README.md
index f45e4bf08..073b621e9 100644
--- a/README.md
+++ b/README.md
@@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
   ```bash
   mkdir build
   cd build
-  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
   cmake --build . --config Release
   ```
 
diff --git a/flake.nix b/flake.nix
index dda7ee7b7..5657e8258 100644
--- a/flake.nix
+++ b/flake.nix
@@ -6,24 +6,27 @@
   outputs = { self, nixpkgs, flake-utils }:
     flake-utils.lib.eachDefaultSystem (system:
       let
-        inherit (pkgs.stdenv) isAarch64 isDarwin;
-        inherit (pkgs.lib) optionals;
-        isM1 = isAarch64 && isDarwin;
-        osSpecific = if isM1 then
-          with pkgs.darwin.apple_sdk_11_0.frameworks; [
-            Accelerate
-            MetalKit
-            MetalPerformanceShaders
-            MetalPerformanceShadersGraph
-          ]
-        else if isDarwin then
-          with pkgs.darwin.apple_sdk.frameworks; [
-            Accelerate
-            CoreGraphics
-            CoreVideo
-          ]
-        else
-          [ ];
+        inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
+        osSpecific = with pkgs; [ openmpi ] ++
+        (
+          if isAarch64 && isDarwin then
+            with pkgs.darwin.apple_sdk_11_0.frameworks; [
+              Accelerate
+              MetalKit
+              MetalPerformanceShaders
+              MetalPerformanceShadersGraph
+            ]
+          else if isAarch32 && isDarwin then
+            with pkgs.darwin.apple_sdk.frameworks; [
+              Accelerate
+              CoreGraphics
+              CoreVideo
+            ]
+          else if isx86_32 || isx86_64 then
+            with pkgs; [ mkl ]
+          else
+            with pkgs; [ openblas ]
+        );
         pkgs = import nixpkgs { inherit system; };
         llama-python =
           pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
@@ -31,22 +34,28 @@
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
-          postPatch = if isM1 then ''
+          postPatch = ''
             substituteInPlace ./ggml-metal.m \
               --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-          '' else
-            "";
-          nativeBuildInputs = with pkgs; [ cmake ];
+          '';
+          nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
           buildInputs = osSpecific;
-          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
-            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-            "-DLLAMA_METAL=ON"
+          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
+            ++ (if isAarch64 && isDarwin then [
+              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+              "-DLLAMA_METAL=ON"
+            ] else if isx86_32 || isx86_64 then [
+              "-DLLAMA_BLAS=ON"
+              "-DLLAMA_BLAS_VENDOR=Intel10_lp64"
+            ] else [
+              "-DLLAMA_BLAS=ON"
+              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
           ]);
           installPhase = ''
             runHook preInstall
 
-            mkdir -p $out/bin
-            mv bin/* $out/bin/
+            install -D bin/* -t $out/bin
+            install -Dm644 lib*.so -t $out/lib
             mv $out/bin/main $out/bin/llama
             mv $out/bin/server $out/bin/llama-server
 

From 294f424554c1599784ac9962462fc39ace92d8a5 Mon Sep 17 00:00:00 2001
From: Rinne <AsakusaRinne@gmail.com>
Date: Wed, 19 Jul 2023 15:06:40 +0800
Subject: [PATCH 22/22] llama : extend API to get max devices at runtime
 (#2253)

---
 llama.cpp | 4 ++++
 llama.h   | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index fa3b7c03c..3319b7023 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -875,6 +875,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
diff --git a/llama.h b/llama.h
index e744584f2..b676a383b 100644
--- a/llama.h
+++ b/llama.h
@@ -153,6 +153,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();