From 27ad57a69b85bf12420a27e9945e580cc280be57 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:46:21 +0300 Subject: [PATCH 01/22] Metal: faster Q4_0 and Q4_1 matrix x vector kernels (#2212) * 3-5% faster Q4_0 on Metal * 7-25% faster Q4_1 on Metal * Oops, forgot to delete the original Q4_1 kernel --------- Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 8 +-- ggml-metal.metal | 182 ++++++++++++++++++++++++++++------------------- 2 files changed, 109 insertions(+), 81 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 02dc9beb9..c795ee227 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -739,12 +739,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - if (src0t == GGML_TYPE_Q4_0) { - [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_1) { - [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q3_K || diff --git a/ggml-metal.metal b/ggml-metal.metal index 30d60fa58..f094a1d40 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -395,9 +395,12 @@ kernel void kernel_mul_mat_q4_0_f32( // each thread in a SIMD group deals with 1 block. for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + float sumy = 0; for (int i = 0; i < QK4_0 / 4; i++) { y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } + sumy *= (-8.f); for (int row = 0; row < N_DST; row++) { // prefetch next x block @@ -405,39 +408,50 @@ kernel void kernel_mul_mat_q4_0_f32( // calculate float d = qb_curr.d; - float2 acc = {0.0f, 0.0f}; + float acc = sumy; for (int i = 0; i < 16; i++) { - acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - acc[1] += yl[i] + yl[i+16]; + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); } - sumf[row] += d * (acc[0] - 8.f*acc[1]); + sumf[row] += d * acc; qb_curr = qb_next; } } - for (int i = 0; i < QK4_0 / 4; i++) { - y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); - } - - for (int row = 0; row < N_DST; row++) { - // prefetch next x block - qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - - // calculate - float d = qb_curr.d; - float2 acc = {0.0f, 0.0f}; - for (int i = 0; i < 16; i++) { - acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - acc[1] += yl[i] + yl[i+16]; + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } } - if (tiisg < nb % N_SIMDWIDTH) { - sumf[row] += d * (acc[0] - 8.f*acc[1]); - } - qb_curr = qb_next; + } else { - all_sum = simd_sum(sumf[row]); - if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { - dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + sumy *= (-8.f); + + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + float d = qb_curr.d; + float acc = sumy; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc; + } + qb_curr = qb_next; + + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } } } } @@ -449,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { - const int nb = ne00/QK4_1; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb; + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nb = ne00/QK4_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_1 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; - const uint nth = tptg.x*tptg.y; - const uint ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first = 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - const float m = (float)x[i].m; - - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_1 + first; - - float2 acc = {0.0f, 0.0f}; - - for (int j = 0; j < 4; ++j) { - - acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m); - acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m); + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } - sumf += acc[0] + acc[1]; + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + sumf[row] += d * acc + m * sumy; + qb_curr = qb_next; + } } - sum[ith] = sumf; + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } + } else { - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc + m * sumy; + } + qb_curr = qb_next; + + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } } } From 697966680b27d9b4f05668605b863cb9aea3e15f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Jul 2023 16:36:41 +0300 Subject: [PATCH 02/22] ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope) --- ggml-cuda.cu | 54 ++++++++++++++++++++++++++-- ggml.c | 99 ++++++++++++++++++++++++++-------------------------- 2 files changed, 101 insertions(+), 52 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e0d5e9156..920466aae 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1667,6 +1667,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const float col_theta_scale = powf(theta_scale, col); + + const float theta = p*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = block_p*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.y*blockIdx.y + threadIdx.y; @@ -2064,6 +2098,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i rope_f32<<>>(x, dst, ncols, p, theta_scale); } +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(nrows % 4 == 0); + const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_glm_f32<<>>(x, dst, ncols, p, block_p, theta_scale); +} + static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1); const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; @@ -2618,13 +2660,21 @@ inline void ggml_cuda_op_rope( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - GGML_ASSERT(mode == 0); + const int n_ctx = ((int32_t *) src1->data)[3]; const float theta_scale = powf(10000.0, -2.0f/n_dims); const float p = ((mode & 1) == 0 ? n_past + i02 : i02); + bool is_glm = mode & 4; + // compute - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + if (is_glm) { + const float id_p = min(p, n_ctx - 2.f); + const float block_p = max(p - (n_ctx - 2.f), 0.f); + rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); + } else { + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + } (void) dst; (void) src0_ddq_i; diff --git a/ggml.c b/ggml.c index c137ae658..f5821f1f1 100644 --- a/ggml.c +++ b/ggml.c @@ -10684,6 +10684,8 @@ static void ggml_compute_forward_mul_mat( const enum ggml_type type = src0->type; + const bool src1_cont = ggml_is_contiguous(src1); + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; @@ -10747,7 +10749,7 @@ static void ggml_compute_forward_mul_mat( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); if (type != GGML_TYPE_F32) { - float * const wdata = params->wdata; + float * const wdata = params->wdata; ggml_to_float_t const to_float = type_traits[type].to_float; size_t id = 0; @@ -10805,7 +10807,7 @@ static void ggml_compute_forward_mul_mat( // src1 rows const int64_t nr1 = ne11*ne12*ne13; - void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { @@ -10828,7 +10830,15 @@ static void ggml_compute_forward_mul_mat( const int64_t i3 = i13; const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); - const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size; + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); @@ -12982,12 +12992,13 @@ static void ggml_compute_forward_conv_1d( }; } -// ggml_compute_forward_conv_2d_sk_p0 +// ggml_compute_forward_conv_2d -static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( +static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13007,28 +13018,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t s1 = ((const int32_t*)(opt0->data))[1]; + const int32_t p0 = ((const int32_t*)(opt0->data))[2]; + const int32_t p1 = ((const int32_t*)(opt0->data))[3]; + const int32_t d0 = ((const int32_t*)(opt0->data))[4]; + const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare source data (src1) { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i12*nb12); + ggml_fp16_t * dst_data = wdata; - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); } } } @@ -13071,19 +13091,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( } } -static void ggml_compute_forward_conv_2d_sk_p0( +static void ggml_compute_forward_conv_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + const struct ggml_tensor * opt0, + struct ggml_tensor * dst + ) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst); } break; case GGML_TYPE_F32: { - //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst); + //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst); GGML_ASSERT(false); } break; default: @@ -13093,32 +13115,6 @@ static void ggml_compute_forward_conv_2d_sk_p0( } } -// ggml_compute_forward_conv_2d - -static void ggml_compute_forward_conv_2d( - const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* src1, - const struct ggml_tensor* opt0, - struct ggml_tensor* dst) { - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t s1 = ((const int32_t*)(opt0->data))[1]; - const int32_t p0 = ((const int32_t*)(opt0->data))[2]; - const int32_t p1 = ((const int32_t*)(opt0->data))[3]; - const int32_t d0 = ((const int32_t*)(opt0->data))[4]; - const int32_t d1 = ((const int32_t*)(opt0->data))[5]; - GGML_ASSERT(d0 == 1); // dilation not supported - GGML_ASSERT(d1 == 1); - GGML_ASSERT(p0 == 0); // padding not supported - GGML_ASSERT(p1 == 0); - - if (s0 == src0->ne[0] && s1 == src0->ne[1]) { - ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst); - } else { - GGML_ASSERT(false); // only stride equal to kernel size is supported - } -} - // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( @@ -16575,19 +16571,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { const int64_t ne11 = node->src[1]->ne[1]; // H const int64_t ne12 = node->src[1]->ne[2]; // C + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; const int64_t nk = ne00*ne01; + const int64_t ew0 = nk * ne02; - UNUSED(ne02); UNUSED(ne03); - UNUSED(nk); + UNUSED(ne2); size_t cur = 0; if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); } else { GGML_ASSERT(false); From 229aab351c375899debad45fcb213bf0565bba4e Mon Sep 17 00:00:00 2001 From: James Reynolds Date: Fri, 14 Jul 2023 11:34:40 -0600 Subject: [PATCH 03/22] make : fix combination of LLAMA_METAL and LLAMA_MPI (#2208) Fixes https://github.com/ggerganov/llama.cpp/issues/2166 by moving commands after the CFLAGS are changed. --- Makefile | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index f887ed67e..0a4f0640d 100644 --- a/Makefile +++ b/Makefile @@ -151,9 +151,6 @@ ifdef LLAMA_MPI CFLAGS += -DGGML_USE_MPI -Wno-cast-qual CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual OBJS += ggml-mpi.o - -ggml-mpi.o: ggml-mpi.c ggml-mpi.h - $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI ifdef LLAMA_OPENBLAS @@ -226,9 +223,6 @@ ifdef LLAMA_METAL CXXFLAGS += -DGGML_USE_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders OBJS += ggml-metal.o - -ggml-metal.o: ggml-metal.m ggml-metal.h - $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_METAL ifneq ($(filter aarch64%,$(UNAME_M)),) @@ -253,6 +247,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +ifdef LLAMA_METAL +ggml-metal.o: ggml-metal.m ggml-metal.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_METAL + +ifdef LLAMA_MPI +ggml-mpi.o: ggml-mpi.c ggml-mpi.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_MPI + ifdef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ From 4304bd3cded73c867a882ea5ca4517e3995cc996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 14 Jul 2023 19:44:08 +0200 Subject: [PATCH 04/22] CUDA: mul_mat_vec_q kernels for k-quants (#2203) --- ggml-cuda.cu | 357 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 328 insertions(+), 29 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 920466aae..4c9e21429 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -13,6 +13,8 @@ #include "ggml-cuda.h" #include "ggml.h" +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)( #define QK4_0 32 #define QR4_0 2 -#define QI4_0 4 +#define QI4_0 (QK4_0 / (4 * QR4_0)) typedef struct { half d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants @@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 #define QK4_1 32 #define QR4_1 2 -#define QI4_1 4 +#define QI4_1 (QK4_1 / (4 * QR4_1)) typedef struct { half d; // delta half m; // min @@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong #define QK5_0 32 #define QR5_0 2 -#define QI5_0 4 +#define QI5_0 (QK5_0 / (4 * QR5_0)) typedef struct { half d; // delta uint8_t qh[4]; // 5-th bit of quants @@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QK5_1 32 #define QR5_1 2 -#define QI5_1 4 +#define QI5_1 (QK5_1 / (4 * QR5_1)) typedef struct { half d; // delta half m; // min @@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + #define QK8_0 32 #define QR8_0 1 -#define QI8_0 8 +#define QI8_0 (QK8_0 / (4 * QR8_0)) typedef struct { half d; // delta int8_t qs[QK8_0]; // quants @@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define QK8_1 32 #define QR8_1 1 -#define QI8_1 8 +#define QI8_1 (QK8_1 / (4 * QR8_1)) typedef struct { half d; // delta half s; // unquantized sum @@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_ #define K_SCALE_SIZE 12 #endif +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants @@ -151,6 +155,8 @@ typedef struct { } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits @@ -163,6 +169,8 @@ typedef struct { } block_q3_K; //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) #ifdef GGML_QKK_64 typedef struct { half d[2]; // super-block scales/mins @@ -180,6 +188,8 @@ typedef struct { static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); #endif +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) #ifdef GGML_QKK_64 typedef struct { half d; // super-block scale @@ -199,6 +209,8 @@ typedef struct { static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); #endif +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits @@ -1271,8 +1283,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __ y[iybs + iqs + y_offset] = v.y; } -static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; int vi; @@ -1293,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]); @@ -1318,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; int qs; @@ -1353,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]); @@ -1387,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; int vi; @@ -1406,7 +1423,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq2_K->d; + const float dmin = bq2_K->dmin; + + const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]); + + for (int i = 0; i < QR2_K; ++i) { + const int sc = bq2_K->scales[scale_offset + 2*i]; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const float d8i = bq8i->d; + + const int vi = (v >> (2*i)) & 0x03030303; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + + sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + float sumf = 0.0f; + + const float d = bq3_K->d; + + int vl; + memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int)); + + int vh; + memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int)); + vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted + vh >>= bq8_offset; + + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + const int bq8_offset = QR4_K * (iqs / QI8_1); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq4_K->d; + const float dmin = bq4_K->dmin; + + const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]); + + for (int i = 0; i < QR4_K; ++i) { + const int isc = bq8_offset + i; + + uint8_t sc, m; + get_scale_min_k4(isc, bq4_K->scales, sc, m); + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vi = (v >> (4*i)) & 0x0F0F0F0F; + + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int bq8_offset = QR5_K * (iqs / QI8_1); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq5_K->d; + const float dmin = bq5_K->dmin; + + const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]); + + const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset; + + for (int i = 0; i < QR5_K; ++i) { + const int isc = bq8_offset + i; + + uint8_t sc, m; + get_scale_min_k4(isc, bq5_K->scales, sc, m); + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> i) << 4) & 0x10101010; + + const int vi = vil | vih; + + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + float sumf = 0.0f; + + const float d = bq6_K->d; + + int vl; + memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int)); + + int vh; + memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int)); + + for (int i = 0; i < QR6_K; ++i) { + const int sc = bq6_K->scales[scale_offset + 4*i]; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]); + const float d8i = bq8i->d; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template @@ -1429,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index - const int iby = i + threadIdx.x / qi; // y block index + const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int @@ -1962,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f } static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -1971,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK4_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -1980,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK5_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -1989,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK5_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -1998,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -2006,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * <<>>(vx, vy, dst, ncols, nrows); } +static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<1, 1, convert_f16><<>>(vx, y, k); @@ -2494,13 +2769,22 @@ inline void ggml_cuda_op_mul_mat_vec( int id; CUDA_CHECK(cudaGetDevice(&id)); - const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 || + bool mul_mat_vec_q_implemented = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; +#if QK_K == 256 + mul_mat_vec_q_implemented = mul_mat_vec_q_implemented || + src0->type == GGML_TYPE_Q2_K || + src0->type == GGML_TYPE_Q3_K || + src0->type == GGML_TYPE_Q4_K || + src0->type == GGML_TYPE_Q5_K || + src0->type == GGML_TYPE_Q6_K; +#endif // QK_K == 256 - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented; + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) { @@ -2526,6 +2810,21 @@ inline void ggml_cuda_op_mul_mat_vec( case GGML_TYPE_Q8_0: mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; default: GGML_ASSERT(false); break; From 206e01de117cc65ed8713ac9fdebc57ba4532ec3 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Sat, 15 Jul 2023 02:38:24 +0800 Subject: [PATCH 05/22] cuda : support broadcast add & mul (#2192) Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4c9e21429..73cfc55e5 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -252,13 +252,13 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; -static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { +static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (i >= kx) { return; } - dst[i] = x[i] + y[i]; + dst[i] = x[i] + y[i%ky]; } static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { @@ -1996,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } -static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; - add_f32<<>>(x, y, dst, k); +static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32<<>>(x, y, dst, kx, ky); } static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) { @@ -2610,17 +2610,15 @@ inline void ggml_cuda_op_add( GGML_ASSERT(src1_ddf_i != nullptr); GGML_ASSERT(dst_ddf_i != nullptr); - // TODO: support broadcasting - GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1)); - const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; - // const int64_t ne10 = src1->ne[0]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); } else { @@ -2644,19 +2642,12 @@ inline void ggml_cuda_op_mul( GGML_ASSERT(dst_ddf_i != nullptr); const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - for (int64_t i01 = i01_low; i01 < i01_high; i01++) { - const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0 - - float * src0_ddf_i01 = src0_ddf_i + i01*ne00; - float * src1_ddf_i01 = src1_ddf_i + i11*ne10; - float * dst_ddf_i01 = dst_ddf_i + i01*ne00; - - // compute - mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main); - } + mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); (void) dst; (void) src0_ddq_i; From c48c525f8711780f3f7c59bf92f1760f38317218 Mon Sep 17 00:00:00 2001 From: Shangning Xu <32517059+xushangning@users.noreply.github.com> Date: Sat, 15 Jul 2023 02:40:05 +0800 Subject: [PATCH 06/22] examples : fixed path typos in embd-input (#2214) --- examples/embd-input/README.md | 2 +- examples/embd-input/llava.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md index 02d028f26..5c4c75ea7 100644 --- a/examples/embd-input/README.md +++ b/examples/embd-input/README.md @@ -17,7 +17,7 @@ make import torch bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" -pth_path = "./examples/embd_input/llava_projection.pth" +pth_path = "./examples/embd-input/llava_projection.pth" dic = torch.load(bin_path) used_key = ["model.mm_projector.weight","model.mm_projector.bias"] diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py index 2f20cb722..bcbdd2bed 100644 --- a/examples/embd-input/llava.py +++ b/examples/embd-input/llava.py @@ -59,7 +59,7 @@ if __name__=="__main__": # Also here can use pytorch_model-00003-of-00003.bin directly. a.load_projection(os.path.join( os.path.dirname(__file__) , - "llava_projetion.pth")) + "llava_projection.pth")) respose = a.chat_with_image( Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?") From de8342423d9600cf6e15455c1a27bae441262b45 Mon Sep 17 00:00:00 2001 From: Ali Chraghi <63465728+alichraghi@users.noreply.github.com> Date: Fri, 14 Jul 2023 11:50:58 -0700 Subject: [PATCH 07/22] build.zig : install config header (#2216) --- build.zig | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/build.zig b/build.zig index 49c159ebf..2287d2a2c 100644 --- a/build.zig +++ b/build.zig @@ -1,9 +1,19 @@ const std = @import("std"); +const commit_hash = @embedFile(".git/refs/heads/master"); -// Zig Version: 0.11.0-dev.3379+629f0d23b +// Zig Version: 0.11.0-dev.3986+e05c242cd pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); + + const config_header = b.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline + }, + ); + const lib = b.addStaticLibrary(.{ .name = "llama", .target = target, @@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void { lib.linkLibCpp(); lib.addIncludePath("."); lib.addIncludePath("./examples"); - lib.addCSourceFiles(&.{ - "ggml.c", - }, &.{"-std=c11"}); - lib.addCSourceFiles(&.{ - "llama.cpp", - }, &.{"-std=c++11"}); + lib.addConfigHeader(config_header); + lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); + lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); b.installArtifact(lib); const examples = .{ "main", "baby-llama", "embedding", - // "metal", + "metal", "perplexity", "quantize", "quantize-stats", "save-load-state", - // "server", + "server", "simple", "train-text-from-scratch", }; @@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void { }); exe.addIncludePath("."); exe.addIncludePath("./examples"); + exe.addConfigHeader(config_header); exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}), + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), "examples/common.cpp", }, &.{"-std=c++11"}); exe.linkLibrary(lib); b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); run_cmd.step.dependOn(b.getInstallStep()); if (b.args) |args| run_cmd.addArgs(args); - const run_step = b.step("run_" ++ example_name, "Run the app"); + + const run_step = b.step("run-" ++ example_name, "Run the app"); run_step.dependOn(&run_cmd.step); } } From 7513b7b0a1c11faa00ad5a34d22681e5f07d32e4 Mon Sep 17 00:00:00 2001 From: Bach Le Date: Sat, 15 Jul 2023 02:55:24 +0800 Subject: [PATCH 08/22] llama : add functions that work directly on model (#2197) * Remove vocab reference from context * Add functions that works directly with model --- llama.cpp | 64 +++++++++++++++++++++++++++++++++++++++++-------------- llama.h | 25 +++++++++++++++++++++- 2 files changed, 72 insertions(+), 17 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2d09d6ce7..b0cd9417c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -303,7 +303,7 @@ struct llama_model { }; struct llama_context { - llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} #ifdef GGML_USE_METAL ~llama_context() { if (ctx_metal) { @@ -324,7 +324,6 @@ struct llama_context { int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) const llama_model & model; - const llama_vocab & vocab; bool model_owner = false; @@ -2697,7 +2696,7 @@ struct llama_context * llama_new_context_with_model( return nullptr; } - llama_context * ctx = new llama_context(*model, model->vocab); + llama_context * ctx = new llama_context(*model); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -3535,13 +3534,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { return 0; } -int llama_tokenize( - struct llama_context * ctx, +int llama_tokenize_with_model( + const struct llama_model * model, const char * text, llama_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize(ctx->vocab, text, add_bos); + auto res = llama_tokenize(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { fprintf(stderr, "%s: too many tokens\n", __func__); @@ -3555,8 +3554,29 @@ int llama_tokenize( return res.size(); } +int llama_tokenize( + struct llama_context * ctx, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); +} + +int llama_n_vocab_from_model(const struct llama_model * model) { + return model->vocab.id_to_token.size(); +} + +int llama_n_ctx_from_model(const struct llama_model * model) { + return model->hparams.n_ctx; +} + +int llama_n_embd_from_model(const struct llama_model * model) { + return model->hparams.n_embd; +} + int llama_n_vocab(const struct llama_context * ctx) { - return ctx->vocab.id_to_token.size(); + return ctx->model.vocab.id_to_token.size(); } int llama_n_ctx(const struct llama_context * ctx) { @@ -3567,17 +3587,25 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } +int llama_get_vocab_from_model( + const struct llama_model * model, + const char * * strings, + float * scores, + int capacity) { + int n = std::min(capacity, (int) model->vocab.id_to_token.size()); + for (int i = 0; ivocab.id_to_token[i].tok.c_str(); + scores[i] = model->vocab.id_to_token[i].score; + } + return n; +} + int llama_get_vocab( const struct llama_context * ctx, const char * * strings, float * scores, int capacity) { - int n = std::min(capacity, (int) ctx->vocab.id_to_token.size()); - for (int i = 0; ivocab.id_to_token[i].tok.c_str(); - scores[i] = ctx->vocab.id_to_token[i].score; - } - return n; + return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); } float * llama_get_logits(struct llama_context * ctx) { @@ -3588,12 +3616,16 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - if (token >= llama_n_vocab(ctx)) { +const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { + if (token >= llama_n_vocab_from_model(model)) { return nullptr; } - return ctx->vocab.id_to_token[token].tok.c_str(); + return model->vocab.id_to_token[token].tok.c_str(); +} + +const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { + return llama_token_to_str_with_model(&ctx->model, token); } llama_token llama_token_bos() { diff --git a/llama.h b/llama.h index 4596b1ed4..e7c60f40c 100644 --- a/llama.h +++ b/llama.h @@ -270,10 +270,21 @@ extern "C" { int n_max_tokens, bool add_bos); + LLAMA_API int llama_tokenize_with_model( + const struct llama_model * model, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); + LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); + LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); + // Get the vocabulary as output parameters. // Returns number of results. LLAMA_API int llama_get_vocab( @@ -282,6 +293,12 @@ extern "C" { float * scores, int capacity); + LLAMA_API int llama_get_vocab_from_model( + const struct llama_model * model, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token @@ -294,7 +311,13 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str( + const struct llama_context * ctx, + llama_token token); + + LLAMA_API const char * llama_token_to_str_with_model( + const struct llama_model * model, + llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence From e8035f141e1f71d739fa5cfc9c01531cdee6fc16 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Fri, 14 Jul 2023 14:55:56 -0400 Subject: [PATCH 09/22] ggml : fix static_assert with older compilers #2024 (#2218) --- ggml.c | 10 ++++++---- k_quants.h | 8 ++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index f5821f1f1..3ea8ba6ec 100644 --- a/ggml.c +++ b/ggml.c @@ -31,11 +31,17 @@ #include #endif +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#endif #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts @@ -112,10 +118,6 @@ typedef void * thread_ret_t; #endif #endif -#ifdef __HAIKU__ -#define static_assert(cond, msg) _Static_assert(cond, msg) -#endif - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 diff --git a/k_quants.h b/k_quants.h index 6abe3d7b8..adc6a3913 100644 --- a/k_quants.h +++ b/k_quants.h @@ -15,6 +15,14 @@ #define K_SCALE_SIZE 12 #endif +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + // // Super-block quantization structures // From 7cdd30bf1f84339c55a5e3de29384f6bbdebb61c Mon Sep 17 00:00:00 2001 From: Bach Le Date: Sat, 15 Jul 2023 03:00:58 +0800 Subject: [PATCH 10/22] cuda : allocate all temporary ggml_tensor_extra_gpu from a fixed-size buffer (#2220) --- ggml-cuda.cu | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 73cfc55e5..0646fa7b2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3646,6 +3646,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } +static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; + struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) { if (scratch && g_scratch_size == 0) { return; @@ -3663,8 +3679,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo } tensor->backend = GGML_BACKEND_GPU; - struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); + struct ggml_tensor_extra_gpu * extra; const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || @@ -3679,10 +3694,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo if (tensor->op == GGML_OP_VIEW) { memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); } + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src1_ddv; } else if (scratch) { GGML_ASSERT(size <= g_scratch_size); @@ -3695,6 +3712,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); g_scratch_buffer = data; } + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = data + g_scratch_offset; g_scratch_offset += size; @@ -3704,6 +3722,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo void * data; CUDA_CHECK(cudaMalloc(&data, size)); CUDA_CHECK(cudaMemset(data, 0, size)); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); extra->data_device[g_main_device] = data; } From 7dabc66f3c63f8ea0f61bac346fa138e01df675f Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Sat, 15 Jul 2023 03:05:08 +0800 Subject: [PATCH 11/22] make : use pkg-config for OpenBLAS (#2222) --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 0a4f0640d..6c74e1346 100644 --- a/Makefile +++ b/Makefile @@ -154,8 +154,8 @@ ifdef LLAMA_MPI endif # LLAMA_MPI ifdef LLAMA_OPENBLAS - CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas - LDFLAGS += -lopenblas + CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas) + LDFLAGS += $(shell pkg-config --libs openblas) endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS From a6803cab946c817fb7aaf2a40b317f5d3e373bd1 Mon Sep 17 00:00:00 2001 From: Dave Della Costa Date: Fri, 14 Jul 2023 15:13:38 -0400 Subject: [PATCH 12/22] flake : add runHook preInstall/postInstall to installPhase so hooks function (#2224) --- flake.nix | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flake.nix b/flake.nix index cebb47b94..dda7ee7b7 100644 --- a/flake.nix +++ b/flake.nix @@ -43,6 +43,8 @@ "-DLLAMA_METAL=ON" ]); installPhase = '' + runHook preInstall + mkdir -p $out/bin mv bin/* $out/bin/ mv $out/bin/main $out/bin/llama @@ -51,6 +53,8 @@ echo "#!${llama-python}/bin/python" > $out/bin/convert.py cat ${./convert.py} >> $out/bin/convert.py chmod +x $out/bin/convert.py + + runHook postInstall ''; meta.mainProgram = "llama"; }; From 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Sat, 15 Jul 2023 06:34:16 -0400 Subject: [PATCH 13/22] llama : add custom RoPE (#2054) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Implement customizable RoPE The original RoPE has pre-defined parameters theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2] Our customizable RoPE, ggml_rope_custom_inplace, uses theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2] with the default matches the original scale = 1.0 base = 10000 The new command line arguments --rope-freq-base --rope-freq-scale set the two new RoPE parameter. Recent researches show changing these two parameters extends the context limit with minimal loss. 1. Extending Context to 8K kaiokendev https://kaiokendev.github.io/til#extending-context-to-8k 2. Extending Context Window of Large Language Models via Positional Interpolation Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian https://arxiv.org/abs/2306.15595 3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation. https://www.reddit.com/user/bloc97 https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ For the bold, try adding the following command line parameters to your favorite model: -c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5 * ggml-metal: fix custom rope * common: fix argument names in help * llama: increase MEM_REQ_EVAL for MODEL_3B It avoids crashing for quantized weights on CPU. Better ways to calculate the required buffer size would be better. * llama: make MEM_REQ_EVAL depend on n_ctx * server: use proper Content-Type in curl examples Without the header Content-Type: application/json, curl will POST with Content-Type: application/x-www-form-urlencoded Though our simple server doesn't care, the httplib.h used has a limit with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192 With Content-Type: application/json, we can send large json data. * style : minor fixes, mostly indentations * ggml : fix asserts --------- Co-authored-by: Georgi Gerganov --- examples/common.cpp | 16 ++++++++ examples/common.h | 2 + examples/main/main.cpp | 12 +++++- examples/server/README.md | 1 + examples/server/chat.sh | 2 + examples/server/server.cpp | 18 ++++++++ ggml-metal.m | 45 +++++++++++--------- ggml-metal.metal | 6 ++- ggml.c | 50 +++++++++++++++++------ ggml.h | 11 +++++ llama.cpp | 84 +++++++++++++++++++++++--------------- llama.h | 5 +++ 12 files changed, 185 insertions(+), 67 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 94875b054..8705127cb 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -493,6 +505,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -573,6 +587,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.use_mlock = params.use_mlock; lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; + lparams.rope_freq_base = params.rope_freq_base; + lparams.rope_freq_scale = params.rope_freq_scale; return lparams; } diff --git a/examples/common.h b/examples/common.h index 6315df961..f52fef629 100644 --- a/examples/common.h +++ b/examples/common.h @@ -32,6 +32,8 @@ struct gpt_params { int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + float rope_freq_base = 10000.0f; // RoPE base frequency + float rope_freq_scale = 1.0f; // RoPE frequency scaling factor // sampling parameters std::unordered_map logit_bias; // logit bias for specific tokens diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 2248c2458..bcbcf12b0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -84,9 +84,17 @@ int main(int argc, char ** argv) { return 0; } + if (params.rope_freq_base != 10000.0) { + fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 1.0) { + fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + } + if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); + fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);" + " you are on your own\n", __func__, params.n_ctx); } else if (params.n_ctx < 8) { fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; diff --git a/examples/server/README.md b/examples/server/README.md index ad9b6bb08..e5ca8269b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the ```sh curl --request POST \ --url http://localhost:8080/completion \ + --header "Content-Type: application/json" \ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' ``` diff --git a/examples/server/chat.sh b/examples/server/chat.sh index a89f8e908..014360121 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -32,6 +32,7 @@ tokenize() { --silent \ --request POST \ --url "${API_URL}/tokenize" \ + --header "Content-Type: application/json" \ --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ | jq '.tokens[]' } @@ -64,6 +65,7 @@ chat_completion() { --no-buffer \ --request POST \ --url "${API_URL}/completion" \ + --header "Content-Type: application/json" \ --data-raw "${DATA}") printf "\n" diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 296c5d646..f442f2b56 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "--rope-freq-base") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } + else if (arg == "--rope-freq-scale") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; diff --git a/ggml-metal.m b/ggml-metal.m index c795ee227..ee205bcdf 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -881,28 +881,35 @@ void ggml_metal_graph_compute( const int n_past = ((int32_t *)(src1->data))[0]; + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index f094a1d40..9f9a4fbd7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -656,17 +656,19 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(10000.0, -2.0f/n_dims); + const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { diff --git a/ggml.c b/ggml.c index 3ea8ba6ec..5ce1da0e9 100644 --- a/ggml.c +++ b/ggml.c @@ -6956,6 +6956,8 @@ struct ggml_tensor * ggml_rope_impl( int n_past, int n_dims, int mode, + float freq_base, + float freq_scale, int n_ctx, bool inplace) { GGML_ASSERT(n_past >= 0); @@ -6969,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[2] = mode; ((int32_t *) b->data)[3] = n_ctx; + memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float)); + memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float)); ggml_scratch_load(ctx); @@ -6993,7 +6997,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false); } struct ggml_tensor * ggml_rope_inplace( @@ -7003,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true); } // ggml_rope_back @@ -12074,16 +12090,21 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); @@ -12112,7 +12133,7 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12124,7 +12145,7 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12201,16 +12222,21 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); @@ -12239,7 +12265,7 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12251,7 +12277,7 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12312,7 +12338,7 @@ static void ggml_compute_forward_rope_f16( const float x0 = GGML_FP16_TO_FP32(src[0]); const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -15710,7 +15736,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); + assert(ggml_nelements(src1) == 6); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -15731,7 +15757,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); + assert(ggml_nelements(src1) == 3); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; diff --git a/ggml.h b/ggml.h index b88c35bae..24856a255 100644 --- a/ggml.h +++ b/ggml.h @@ -1121,6 +1121,17 @@ extern "C" { int mode, int n_ctx); + // custom RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( diff --git a/llama.cpp b/llama.cpp index b0cd9417c..27e1ee964 100644 --- a/llama.cpp +++ b/llama.cpp @@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * // memory sizes // -static const std::map & MEM_REQ_SCRATCH0() +static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { static std::map k_sizes = { - { MODEL_3B, 256ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + /* empirical scaling, still a guess */ + { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB }, }; return k_sizes; } @@ -140,14 +141,14 @@ static const std::map & MEM_REQ_KV_SELF() // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map & MEM_REQ_EVAL() +static const std::map & MEM_REQ_EVAL(int n_ctx) { static std::map k_sizes = { - { MODEL_3B, 512ull * MB }, - { MODEL_7B, 768ull * MB }, - { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, - { MODEL_65B, 1536ull * MB }, + { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB }, }; return k_sizes; } @@ -189,6 +190,10 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { @@ -647,7 +652,7 @@ struct llama_model_loader { *ctx_size_p = *mmapped_size_p = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16; } } @@ -843,6 +848,8 @@ struct llama_context_params llama_context_default_params() { /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.rope_freq_base =*/ 10000.0f, + /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -966,6 +973,8 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1000,22 +1009,27 @@ static void llama_model_load_internal( } hparams.n_ctx = n_ctx; + + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; } const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1164,9 +1178,9 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size - vram_weights + // weights in VRAM not in memory - MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at (model.type); + MEM_REQ_EVAL(hparams.n_ctx).at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = @@ -1270,6 +1284,8 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1278,7 +1294,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1330,6 +1346,9 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; const int n_gpu_layers = model.n_gpu_layers; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1427,11 +1446,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2674,8 +2693,9 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, + params.progress_callback_user_data)) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; @@ -2750,9 +2770,9 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } diff --git a/llama.h b/llama.h index e7c60f40c..e744584f2 100644 --- a/llama.h +++ b/llama.h @@ -89,6 +89,11 @@ extern "C" { int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + + // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback From 27ab66e437797aedbb23b3599385756b6c26ac39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Podiv=C3=ADn?= <66251151+jpodivin@users.noreply.github.com> Date: Sun, 16 Jul 2023 21:54:47 +0200 Subject: [PATCH 14/22] py : turn verify-checksum-models.py into executable (#2245) README.md was adjusted to reflect the change. Signed-off-by: Jiri Podivin --- README.md | 2 +- scripts/verify-checksum-models.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 scripts/verify-checksum-models.py diff --git a/README.md b/README.md index 476cc438b..f45e4bf08 100644 --- a/README.md +++ b/README.md @@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t ```bash # run the verification script -python3 .\scripts\verify-checksum-models.py +./scripts/verify-checksum-models.py ``` - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py old mode 100644 new mode 100755 index d12748281..307b7c08d --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,3 +1,5 @@ +#!/bin/env python3 + import os import hashlib From 672dda10e4d8ac79df5d5970da7fb69d242ca9a7 Mon Sep 17 00:00:00 2001 From: Qingyou Meng Date: Mon, 17 Jul 2023 03:57:28 +0800 Subject: [PATCH 15/22] ggml : fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG (#2219) * fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG * remove ifdef GGML_PERF; update fmt --- ggml.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/ggml.c b/ggml.c index 5ce1da0e9..c56a3d0e0 100644 --- a/ggml.c +++ b/ggml.c @@ -4412,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) { if (&g_state.contexts[i].context == ctx) { g_state.contexts[i].used = false; - GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", - __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { GGML_ALIGNED_FREE(ctx->mem_buffer); @@ -16317,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); } + ggml_graph_compute_perf_stats_node(node, state->shared); } // distribute new work or execute it direct if 1T @@ -16348,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.type = GGML_TASK_FINALIZE; ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); } + + ggml_graph_compute_perf_stats_node(node, state->shared); } else { break; } @@ -16891,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - //assert(cgraph->work == NULL); - //assert(cgraph->work_size == 0); - uint64_t size_eval = 0; // compute size of intermediate results @@ -17332,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; From b7647436ccc80970b44a270f70f4f2ea139054d1 Mon Sep 17 00:00:00 2001 From: Alex Klinkhamer Date: Sun, 16 Jul 2023 14:01:45 -0700 Subject: [PATCH 16/22] llama : fix t_start_sample_us initialization warning (#2238) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 27e1ee964..0f9d5346d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2205,7 +2205,7 @@ void llama_sample_classifier_free_guidance( struct llama_context * guidance_ctx, float scale, float smooth_factor) { - int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); + int64_t t_start_sample_us = ggml_time_us(); assert(ctx); auto n_vocab = llama_n_vocab(ctx); From 7568d1a2b206331412106ea66da3f871025e0c3c Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jul 2023 01:39:29 +0800 Subject: [PATCH 17/22] Support dup & cont ops on CUDA (#2242) --- ggml-cuda.cu | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0646fa7b2..d3054a7fa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3537,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens (void) dst; } +void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); @@ -3670,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); } } @@ -3776,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); switch (tensor->op) { + case GGML_OP_DUP: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_ADD: if (!any_on_device) { return false; @@ -3830,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_cpy; break; + case GGML_OP_CONT: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: From 6cbf9dfb32f0e23ed3afd02d30ab066ed53e2c4d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 11:50:49 +0300 Subject: [PATCH 18/22] llama : shorten quantization descriptions --- examples/quantize/quantize.cpp | 114 ++++++--------------------------- 1 file changed, 19 insertions(+), 95 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 797d2f0c5..744f549c5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -14,103 +14,27 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { - "Q4_0", - LLAMA_FTYPE_MOSTLY_Q4_0, - " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M", - }, - { - "Q4_1", - LLAMA_FTYPE_MOSTLY_Q4_1, - " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L", - }, - { - "Q5_0", - LLAMA_FTYPE_MOSTLY_Q5_0, - " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M", - }, - { - "Q5_1", - LLAMA_FTYPE_MOSTLY_Q5_1, - " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M", - }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", }, #ifdef GGML_USE_K_QUANTS - { - "Q2_K", - LLAMA_FTYPE_MOSTLY_Q2_K, - " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended", - }, - { - "Q3_K", - LLAMA_FTYPE_MOSTLY_Q3_K_M, - "alias for Q3_K_M" - }, - { - "Q3_K_S", - LLAMA_FTYPE_MOSTLY_Q3_K_S, - " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss", - }, - { - "Q3_K_M", - LLAMA_FTYPE_MOSTLY_Q3_K_M, - " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss", - }, - { - "Q3_K_L", - LLAMA_FTYPE_MOSTLY_Q3_K_L, - " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss", - }, - { - "Q4_K", - LLAMA_FTYPE_MOSTLY_Q4_K_M, - "alias for Q4_K_M", - }, - { - "Q4_K_S", - LLAMA_FTYPE_MOSTLY_Q4_K_S, - " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss", - }, - { - "Q4_K_M", - LLAMA_FTYPE_MOSTLY_Q4_K_M, - " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*", - }, - { - "Q5_K", - LLAMA_FTYPE_MOSTLY_Q5_K_M, - "alias for Q5_K_M", - }, - { - "Q5_K_S", - LLAMA_FTYPE_MOSTLY_Q5_K_S, - " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*", - }, - { - "Q5_K_M", - LLAMA_FTYPE_MOSTLY_Q5_K_M, - " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*", - }, - { - "Q6_K", - LLAMA_FTYPE_MOSTLY_Q6_K, - " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss", - }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", }, + { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", }, + { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", }, + { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", }, #endif - { - "Q8_0", - LLAMA_FTYPE_MOSTLY_Q8_0, - " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended", - }, - { - "F16", - LLAMA_FTYPE_MOSTLY_F16, - "13.00G @ 7B - extremely large, virtually no quality loss - not recommended", - }, - { - "F32", - LLAMA_FTYPE_ALL_F32, - "26.00G @ 7B - absolutely huge, lossless - not recommended", - }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, }; From d01bccde9f759b24449fdaa16306b406a50eb367 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 14:24:43 +0300 Subject: [PATCH 19/22] ci : integrate with ggml-org/ci (#2250) * ci : run ctest ggml-ci * ci : add open llama 3B-v2 tests ggml-ci * ci : disable wget progress output ggml-ci * ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations ggml-ci * tests : try to fix tail free sampling test ggml-ci * ci : add K-quants ggml-ci * ci : add short perplexity tests ggml-ci * ci : add README.md * ppl : add --chunks argument to limit max number of chunks ggml-ci * ci : update README --- .gitignore | 5 +- ci/README.md | 20 +++ ci/run.sh | 262 +++++++++++++++++++++++++++++ examples/common.cpp | 7 + examples/common.h | 1 + examples/perplexity/perplexity.cpp | 6 +- llama.cpp | 15 +- tests/test-sampling.cpp | 2 + 8 files changed, 312 insertions(+), 6 deletions(-) create mode 100644 ci/README.md create mode 100644 ci/run.sh diff --git a/.gitignore b/.gitignore index faec869e0..a23ac5928 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ build/ build-em/ build-debug/ build-release/ +build-ci-debug/ +build-ci-release/ build-static/ build-cublas/ build-opencl/ @@ -25,9 +27,10 @@ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ out/ +tmp/ models/* -*.bin +models-mnt /main /quantize diff --git a/ci/README.md b/ci/README.md new file mode 100644 index 000000000..6c74c8138 --- /dev/null +++ b/ci/README.md @@ -0,0 +1,20 @@ +# CI + +In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: + +https://github.com/ggml-org/ci + +It monitors the `master` branch for new commits and runs the +[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us +to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled +to cover various hardware architectures, including GPU and Apple Silicon instances. + +Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message. +Only the branches of this repo are monitored for this keyword. + +It is a good practice, before publishing changes to execute the full CI locally on your machine: + +```bash +mkdir tmp +bash ./ci/run.sh ./tmp/results ./tmp/mnt +``` diff --git a/ci/run.sh b/ci/run.sh new file mode 100644 index 000000000..c823bc467 --- /dev/null +++ b/ci/run.sh @@ -0,0 +1,262 @@ +#/bin/bash + +if [ -z "$2" ]; then + echo "usage: $0 " + exit 1 +fi + +mkdir -p "$1" +mkdir -p "$2" + +OUT=$(realpath "$1") +MNT=$(realpath "$2") + +rm -v $OUT/*.log +rm -v $OUT/*.exit +rm -v $OUT/*.md + +sd=`dirname $0` +cd $sd/../ +SRC=`pwd` + +## helpers + +# download a file if it does not exist or if it is outdated +function gg_wget { + local out=$1 + local url=$2 + + local cwd=`pwd` + + mkdir -p $out + cd $out + + # should not re-download if file is the same + wget -nv -N $url + + cd $cwd +} + +function gg_printf { + printf -- "$@" >> $OUT/README.md +} + +function gg_run { + ci=$1 + + set -o pipefail + set -x + + gg_run_$ci | tee $OUT/$ci.log + cur=$? + echo "$cur" > $OUT/$ci.exit + + set +x + set +o pipefail + + gg_sum_$ci + + ret=$((ret | cur)) +} + +## ci + +# ctest_debug + +function gg_run_ctest_debug { + cd ${SRC} + + rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + + set +e +} + +function gg_sum_ctest_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' + gg_printf '\n' +} + +# ctest_release + +function gg_run_ctest_release { + cd ${SRC} + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + if [ -z $GG_BUILD_LOW_PERF ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + else + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + fi + + set +e +} + +function gg_sum_ctest_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +# open_llama_3b_v2 + +function gg_run_open_llama_3b_v2 { + cd ${SRC} + + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + + gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw + + path_models="../models-mnt/open-llama/3B-v2" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert.py ${path_models} + + model_f16="${path_models}/ggml-model-f16.bin" + model_q8_0="${path_models}/ggml-model-q8_0.bin" + model_q4_0="${path_models}/ggml-model-q4_0.bin" + model_q4_1="${path_models}/ggml-model-q4_1.bin" + model_q5_0="${path_models}/ggml-model-q5_0.bin" + model_q5_1="${path_models}/ggml-model-q5_1.bin" + model_q3_k="${path_models}/ggml-model-q3_k.bin" + model_q4_k="${path_models}/ggml-model-q4_k.bin" + model_q5_k="${path_models}/ggml-model-q5_k.bin" + model_q6_k="${path_models}/ggml-model-q6_k.bin" + + wiki_test_60="${path_wiki}/wiki.test-60.raw" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + function check_ppl { + qnt="$1" + ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl" + return 20 + fi + + printf ' - %s @ %s OK\n' "$qnt" "$ppl" + return 0 + } + + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + + set +e +} + +function gg_sum_open_llama_3b_v2 { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'OpenLLaMA 3B-v2:\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" + gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" + gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)" + gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)" + gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)" + gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)" + gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" + gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" + gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" +} + +## main + +if [ -z $GG_BUILD_LOW_PERF ]; then + rm -rf ${SRC}/models-mnt + + mnt_models=$(realpath ${MNT}/models) + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt + + python3 -m pip install -r ${SRC}/requirements.txt +fi + +ret=0 + +#test $ret -eq 0 && gg_run ctest_debug +#test $ret -eq 0 && gg_run ctest_release + +if [ -z $GG_BUILD_LOW_PERF ]; then + test $ret -eq 0 && gg_run open_llama_3b_v2 +fi + +exit $ret diff --git a/examples/common.cpp b/examples/common.cpp index 8705127cb..fd6dbc0e3 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_keep = std::stoi(argv[i]); + } else if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_chunks = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } diff --git a/examples/common.h b/examples/common.h index f52fef629..037a4eecb 100644 --- a/examples/common.h +++ b/examples/common.h @@ -28,6 +28,7 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7e120ff12..bfad99939 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // BOS tokens will be added for each chunk before eval auto tokens = ::llama_tokenize(ctx, params.prompt, true); - int count = 0; + const int n_chunk_max = tokens.size() / params.n_ctx; - const int n_chunk = tokens.size() / params.n_ctx; + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_vocab = llama_n_vocab(ctx); const int n_batch = params.n_batch; + int count = 0; double nll = 0.0; + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { diff --git a/llama.cpp b/llama.cpp index 0f9d5346d..fa3b7c03c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2024,9 +2024,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * } // Normalize the second derivatives - float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); - for (float & value : second_derivatives) { - value /= second_derivatives_sum; + { + const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); + + if (second_derivatives_sum > 1e-6f) { + for (float & value : second_derivatives) { + value /= second_derivatives_sum; + } + } else { + for (float & value : second_derivatives) { + value = 1.0f / second_derivatives.size(); + } + } } float cum_sum = 0.0f; diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 64f9455d7..4437c3948 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -200,4 +200,6 @@ int main(void) { test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); printf("OK\n"); + + return 0; } From b1f429095328a34556c0e9a7a2fefced3db3368c Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:01:11 +0800 Subject: [PATCH 20/22] cmake : install targets (#2256) fix #2252 --- CMakeLists.txt | 25 +++++++++++++++++++ convert-lora-to-ggml.py | 1 + convert.py | 1 + examples/baby-llama/CMakeLists.txt | 1 + examples/benchmark/CMakeLists.txt | 1 + examples/embd-input/CMakeLists.txt | 2 ++ examples/embedding/CMakeLists.txt | 1 + examples/main/CMakeLists.txt | 1 + examples/metal/CMakeLists.txt | 1 + examples/perplexity/CMakeLists.txt | 1 + examples/quantize-stats/CMakeLists.txt | 1 + examples/quantize/CMakeLists.txt | 1 + examples/save-load-state/CMakeLists.txt | 1 + examples/server/CMakeLists.txt | 1 + examples/simple/CMakeLists.txt | 1 + .../train-text-from-scratch/CMakeLists.txt | 1 + tests/CMakeLists.txt | 1 + 17 files changed, 42 insertions(+) mode change 100644 => 100755 convert-lora-to-ggml.py mode change 100644 => 100755 convert.py diff --git a/CMakeLists.txt b/CMakeLists.txt index d9381dae1..abc96814d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,6 +512,7 @@ if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(ggml_shared SHARED $) target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) + install(TARGETS ggml_shared LIBRARY) endif() add_library(llama @@ -533,8 +534,32 @@ if (BUILD_SHARED_LIBS) if (LLAMA_METAL) set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") endif() + install(TARGETS llama LIBRARY) endif() +include(GNUInstallDirs) +install( + FILES convert.py + PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE + DESTINATION ${CMAKE_INSTALL_BINDIR}) +install( + FILES convert-lora-to-ggml.py + PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE + DESTINATION ${CMAKE_INSTALL_BINDIR}) # # programs, examples and tests diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py old mode 100644 new mode 100755 index f43c836f5..b4999ff5a --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import json import os import re diff --git a/convert.py b/convert.py old mode 100644 new mode 100755 index 7a2705e5c..e3f1096e1 --- a/convert.py +++ b/convert.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import argparse import concurrent.futures import copy diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index d2ce36367..7b70227a5 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET baby-llama) add_executable(${TARGET} baby-llama.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 037696194..3f3415350 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt index 2b623953e..5bbb1ea02 100644 --- a/examples/embd-input/CMakeLists.txt +++ b/examples/embd-input/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET embdinput) add_library(${TARGET} embd-input-lib.cpp embd-input.h) +install(TARGETS ${TARGET} LIBRARY) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) @@ -8,6 +9,7 @@ endif() set(TARGET embd-input-test) add_executable(${TARGET} embd-input-test.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index db73b6b44..0c752c7bb 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index c364242fb..cc1888948 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET main) add_executable(${TARGET} main.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt index a8c4284a5..f16d49165 100644 --- a/examples/metal/CMakeLists.txt +++ b/examples/metal/CMakeLists.txt @@ -1,3 +1,4 @@ set(TEST_TARGET metal) add_executable(${TEST_TARGET} metal.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE ggml) diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 61b17b828..af00b4e16 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET perplexity) add_executable(${TARGET} perplexity.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index 7bebc11a1..c5c394058 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET quantize-stats) add_executable(${TARGET} quantize-stats.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 475fc8be8..47d0be72e 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index 08dbe5c2b..eadd13cdf 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET save-load-state) add_executable(${TARGET} save-load-state.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 07ba76ad3..812a24b09 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -2,6 +2,7 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) +install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 1568f7364..0ac9cb03a 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET simple) add_executable(${TARGET} simple.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt index 1a44c4961..4459516d0 100644 --- a/examples/train-text-from-scratch/CMakeLists.txt +++ b/examples/train-text-from-scratch/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET train-text-from-scratch) add_executable(${TARGET} train-text-from-scratch.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1acf050a7..11ec6c725 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,7 @@ function(llama_add_test source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) + install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) endfunction() From 45a1b07e9b20c33d71d8c849ff27d693a75a0269 Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:01:55 +0800 Subject: [PATCH 21/22] flake : update flake.nix (#2270) When `isx86_32 || isx86_64`, it will use mkl, else openblas According to https://discourse.nixos.org/t/rpath-of-binary-contains-a-forbidden-reference-to-build/12200/3, add -DCMAKE_SKIP_BUILD_RPATH=ON Fix #2261, Nix doesn't provide mkl-sdl.pc. When we build with -DBUILD_SHARED_LIBS=ON, -DLLAMA_BLAS_VENDOR=Intel10_lp64 replace mkl-sdl.pc by mkl-dynamic-lp64-iomp.pc --- CMakeLists.txt | 11 ++++++++- README.md | 2 +- flake.nix | 63 ++++++++++++++++++++++++++++---------------------- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abc96814d..169332767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,16 @@ if (LLAMA_BLAS) pkg_check_modules(DepBLAS REQUIRED flexiblas_api) elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + pkg_check_modules(DepBLAS mkl-sdl) + if (NOT DepBLAS) + if (BUILD_SHARED_LIBS) + set(LINK_METHOD dynamic) + else() + set(LINK_METHOD static) + endif() + string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR}) + pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp) + endif() elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") # this doesn't provide pkg-config # suggest to assign BLAS_INCLUDE_DIRS on your own diff --git a/README.md b/README.md index f45e4bf08..073b621e9 100644 --- a/README.md +++ b/README.md @@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements ```bash mkdir build cd build - cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake --build . --config Release ``` diff --git a/flake.nix b/flake.nix index dda7ee7b7..5657e8258 100644 --- a/flake.nix +++ b/flake.nix @@ -6,24 +6,27 @@ outputs = { self, nixpkgs, flake-utils }: flake-utils.lib.eachDefaultSystem (system: let - inherit (pkgs.stdenv) isAarch64 isDarwin; - inherit (pkgs.lib) optionals; - isM1 = isAarch64 && isDarwin; - osSpecific = if isM1 then - with pkgs.darwin.apple_sdk_11_0.frameworks; [ - Accelerate - MetalKit - MetalPerformanceShaders - MetalPerformanceShadersGraph - ] - else if isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else - [ ]; + inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin; + osSpecific = with pkgs; [ openmpi ] ++ + ( + if isAarch64 && isDarwin then + with pkgs.darwin.apple_sdk_11_0.frameworks; [ + Accelerate + MetalKit + MetalPerformanceShaders + MetalPerformanceShadersGraph + ] + else if isAarch32 && isDarwin then + with pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] + else if isx86_32 || isx86_64 then + with pkgs; [ mkl ] + else + with pkgs; [ openblas ] + ); pkgs = import nixpkgs { inherit system; }; llama-python = pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); @@ -31,22 +34,28 @@ packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; - postPatch = if isM1 then '' + postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - '' else - ""; - nativeBuildInputs = with pkgs; [ cmake ]; + ''; + nativeBuildInputs = with pkgs; [ cmake pkgconfig ]; buildInputs = osSpecific; - cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" + cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ] + ++ (if isAarch64 && isDarwin then [ + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + "-DLLAMA_METAL=ON" + ] else if isx86_32 || isx86_64 then [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=Intel10_lp64" + ] else [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=OpenBLAS" ]); installPhase = '' runHook preInstall - mkdir -p $out/bin - mv bin/* $out/bin/ + install -D bin/* -t $out/bin + install -Dm644 lib*.so -t $out/lib mv $out/bin/main $out/bin/llama mv $out/bin/server $out/bin/llama-server From 294f424554c1599784ac9962462fc39ace92d8a5 Mon Sep 17 00:00:00 2001 From: Rinne Date: Wed, 19 Jul 2023 15:06:40 +0800 Subject: [PATCH 22/22] llama : extend API to get max devices at runtime (#2253) --- llama.cpp | 4 ++++ llama.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/llama.cpp b/llama.cpp index fa3b7c03c..3319b7023 100644 --- a/llama.cpp +++ b/llama.cpp @@ -875,6 +875,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { return result; } +int llama_max_devices() { + return LLAMA_MAX_DEVICES; +} + bool llama_mmap_supported() { return llama_mmap::SUPPORTED; } diff --git a/llama.h b/llama.h index e744584f2..b676a383b 100644 --- a/llama.h +++ b/llama.h @@ -153,6 +153,8 @@ extern "C" { int32_t n_eval; }; + LLAMA_API int llama_max_devices(); + LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();