ggml : importance matrix support for legacy quants (#4969)

* imatrix: adding support for legacy quants * imatrix: guard Q4_0/Q5_0 against ffn_down craziness --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-12-24 13:28:50 +01:00 · 2024-01-16 19:51:26 +02:00 · 2024-01-16 19:51:26 +02:00 · 334a835a1c
commit 334a835a1c
parent 4feb4b33ee
4 changed files with 226 additions and 8 deletions
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_0_reference(x, y, k);
 }
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
    const int qk = QK4_1;
@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
    return nrow * row_size;
 }
 static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_0 == 32, "QK4_0 must be 32");
    if (!quant_weights) {
        quantize_row_q4_0_reference(x, y, n_per_row);
        return;
    }
    float weight[QK4_0];
    int8_t L[QK4_0];
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;
    const int nb = n_per_row/QK4_0;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK4_0 * ib;
        const float * qw = quant_weights + QK4_0 * ib;
        for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
        y[ib].d = GGML_FP32_TO_FP16(d);
        for (int j = 0; j < 16; ++j) {
            y[ib].qs[j] = L[j] | (L[j+16] << 4);
        }
    }
 }
 size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return nrow * row_size;
 }
 static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_1 == 32, "QK4_1 must be 32");
    if (!quant_weights) {
        quantize_row_q4_1_reference(x, y, n_per_row);
        return;
    }
    float weight[QK4_1];
    uint8_t L[QK4_1], Laux[QK4_1];
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;
    const int nb = n_per_row/QK4_1;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK4_1 * ib;
        const float * qw = quant_weights + QK4_1 * ib;
        for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        float min;
        float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);
        for (int j = 0; j < 16; ++j) {
            y[ib].qs[j] = L[j] | (L[j+16] << 4);
        }
    }
 }
 size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return nrow * row_size;
 }
 static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_0 == 32, "QK5_0 must be 32");
    if (!quant_weights) {
        quantize_row_q5_0_reference(x, y, n_per_row);
        return;
    }
    float weight[QK5_0];
    int8_t L[QK5_0];
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;
    const int nb = n_per_row/QK5_0;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK5_0 * ib;
        const float * qw = quant_weights + QK5_0 * ib;
        for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
        y[ib].d = GGML_FP32_TO_FP16(d);
        uint32_t qh = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j+16];
            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
            // get the 5-th bit and store it in qh at the right position
            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
        }
        memcpy(&y[ib].qh, &qh, sizeof(qh));
    }
 }
 size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return nrow * row_size;
 }
 static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_1 == 32, "QK5_1 must be 32");
    if (!quant_weights) {
        quantize_row_q5_1_reference(x, y, n_per_row);
        return;
    }
    float weight[QK5_1];
    uint8_t L[QK5_1], Laux[QK5_1];
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;
    const int nb = n_per_row/QK5_1;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK5_1 * ib;
        const float * qw = quant_weights + QK5_1 * ib;
        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        float min;
        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);
        uint32_t qh = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j+16];
            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
            // get the 5-th bit and store it in qh at the right position
            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
        }
        memcpy(&y[ib].qh, &qh, sizeof(qh));
    }
 }
 size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return nrow * row_size;
 }
 // ====================== "True" 2-bit (de)-quantization
 static const  uint64_t iq2xxs_grid[256] = {
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -253,3 +253,7 @@ size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
--- a/ggml.c
+++ b/ggml.c
@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
        case GGML_TYPE_Q4_0:
            {
                GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q4_1:
            {
                GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q5_0:
            {
                GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q5_1:
            {
                GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q8_0:
            {
--- a/llama.cpp
+++ b/llama.cpp
@ -8374,6 +8374,8 @@ struct quantize_state_internal {
    int n_k_quantized     = 0;
    int n_fallback        = 0;
    bool has_imatrix      = false;
    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_feed_forward_w2;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
            qs.has_imatrix = true;
        }
    }