mirror of https://github.com/ggerganov/llama.cpp.git

commit 3a0f8b0697 ("clean code 2")
parent 1c5a8b7fec
@@ -2805,7 +2805,7 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "i2", "auto"], default="f16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
@@ -2865,7 +2865,6 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
-       "i2" : gguf.LlamaFileType.MOSTLY_I2,
        "auto": gguf.LlamaFileType.GUESSED,
    }
 
ggml.c (35 lines changed)
@@ -2724,7 +2724,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
-
 };
 
 static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
@@ -2813,7 +2812,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
-
 };
 
 static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
@@ -3078,10 +3076,9 @@ GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
         for (int i = 0; i < GGML_MAX_DIMS; ++i) {
             nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
         }
-        if(tensor->type == 31){
+        if(tensor->type == GGML_TYPE_I2_S){
             nbytes = nbytes / 4 + 32;
         }
-
     }
     else {
         nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
@@ -3107,6 +3104,7 @@ GGML_CALL size_t ggml_type_size(enum ggml_type type) {
 
 GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
     assert(ne % ggml_blck_size(type) == 0);
+    if (type == GGML_TYPE_I2_S) ne /= 4;
     return ggml_type_size(type)*ne/ggml_blck_size(type);
 }
 
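Note (not part of the commit): the two ggml.c hunks above encode the byte accounting for the 2-bit GGML_TYPE_I2_S type. A minimal C sketch of that accounting, assuming four 2-bit weights pack into one byte and a fixed 32-byte tail holds scale data; the helper names are mine:

    #include <stddef.h>
    #include <stdint.h>

    // Rough size model for an I2_S tensor of n elements: one byte per four weights
    // plus a 32-byte tail (mirrors "nbytes = nbytes / 4 + 32" above).
    static size_t i2s_tensor_nbytes(size_t n_elements) {
        return n_elements / 4 + 32;
    }

    // Bytes per row of ne elements (mirrors "if (type == GGML_TYPE_I2_S) ne /= 4").
    static size_t i2s_row_nbytes(int64_t ne) {
        return (size_t) (ne / 4);
    }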
@@ -12333,11 +12331,11 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         return;
     }
 
-    void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    size_t row_size = ggml_row_size(vec_dot_type, ne10);
-    if (src0->type == 31) {
-        row_size = ne10;
-    }
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+    // if (src0->type == 31) {
+    //     row_size = ne10;
+    // }
 
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
@@ -12351,9 +12349,8 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     // attempt to reduce false-sharing (does not seem to make a difference)
     // 16 * 2, accounting for mmla kernels
     float tmp[32];
-    uint8_t *i_weight = (uint8_t*) (src0->data);
-    float * scale = (float * )((i_weight) + (ne00 * ne01 / 4));
-    float * act_scales = (float*) ((char *) wdata + (ne11 * ne10));
+    float * scale = (float * )((uint8_t*) (src0->data) + (ne00 * ne01 / 4));
+    const float * act_scales = (const float*) ((const char *) wdata + (ne11 * ne10));
 
     for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
@@ -12380,7 +12377,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(
                     (src1_cont || src1->type != vec_dot_type
                      ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
                      : (i11 * nb11 + i12 * nb12 + i13 * nb13));
-
                 float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
                 //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
@@ -12388,13 +12384,12 @@ static void ggml_compute_forward_mul_mat_one_chunk(
                 //}
 
                 for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
                     if (src0->type == GGML_TYPE_I2_S) {
                         vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01 / 4, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
                         tmp[ir0 - iir0] = tmp[ir0 - iir0] / (act_scales[i11]) * (*scale);
                     } else {
                         vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
                     }
-
                 }
                 for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
                     memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
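Note (my reading of the mul_mat hunks above, not code from the commit): for I2_S the weight buffer is assumed to hold ne00*ne01/4 bytes of packed 2-bit values followed by a float weight scale, while wdata holds ne11*ne10 bytes of quantized activations followed by one float activation scale per src1 row; each raw vec_dot result is then rescaled. A small C sketch with hypothetical helper names:

    #include <stdint.h>

    // Hypothetical layout helpers matching the pointer arithmetic above.
    static const float * i2s_weight_scale(const void * src0_data, int64_t ne00, int64_t ne01) {
        return (const float *) ((const uint8_t *) src0_data + ne00*ne01/4);
    }

    static const float * i2s_act_scales(const void * wdata, int64_t ne10, int64_t ne11) {
        return (const float *) ((const char *) wdata + ne11*ne10);
    }

    // Undo the activation quantization and apply the weight scale
    // (mirrors "tmp[i] = tmp[i] / (act_scales[i11]) * (*scale)").
    static float i2s_rescale(float raw_dot, float act_scale, float weight_scale) {
        return raw_dot / act_scale * weight_scale;
    }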
@@ -925,7 +925,6 @@ class GGMLQuantizationType(IntEnum):
     F64   = 28
     IQ1_M = 29
     BF16  = 30
-    I2    = 31
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -967,7 +966,6 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS = 30  # except 1d tensors
     MOSTLY_IQ1_M  = 31  # except 1d tensors
     MOSTLY_BF16   = 32  # except 1d tensors
-    MOSTLY_I2     = 33  # except 1d tensors
 
     GUESSED = 1024  # not specified in the model file
 
@@ -1034,7 +1032,6 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.IQ3_S:  (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S:  (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
-    GGMLQuantizationType.I2:     (1, 1),
     GGMLQuantizationType.I8:     (1, 1),
     GGMLQuantizationType.I16:    (1, 2),
     GGMLQuantizationType.I32:    (1, 4),
@@ -225,10 +225,8 @@ class GGUFWriter:
                 dtype = GGMLQuantizationType.I32
             elif tensor_dtype == np.int64:
                 dtype = GGMLQuantizationType.I64
-            elif tensor_dtype == np.uint8:
-                dtype = GGMLQuantizationType.I2
             else:
-                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64, I2 tensors are supported for now")
+                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
         if tensor_dtype == np.uint8:
@@ -239,10 +237,7 @@ class GGUFWriter:
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         self.ti_data += self._pack("I", dtype)
         self.ti_data += self._pack("Q", self.offset_tensor)
-        if dtype == GGMLQuantizationType.I2:
-            self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) + self.data_alignment
-        else:
-            self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1
 
     def add_tensor(
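Note (illustration, not code from the commit): with the I2 branch gone, every tensor's data offset in GGUFWriter advances by the same alignment-padded size; GGUFWriter.ggml_pad rounds a size up to the next multiple of the alignment. A C sketch of that rule, with a name of my own choosing:

    #include <stdint.h>

    // Round nbytes up to the next multiple of align (what GGUFWriter.ggml_pad does).
    static uint64_t pad_to_alignment(uint64_t nbytes, uint64_t align) {
        return (nbytes + align - 1) / align * align;
    }

    // The writer now always does: offset_tensor += pad_to_alignment(tensor_nbytes, data_alignment);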
@@ -257,9 +252,7 @@ class GGUFWriter:
             self.temp_file = fp
 
         shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
-        if (raw_dtype != GGMLQuantizationType.F32 or not name.endswith("scale")):
-            self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
 
         if self.temp_file is None:
             self.tensors.append(tensor)
llama.cpp (97 lines changed)
@@ -262,7 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_OLMO,      "olmo"      },
     { LLM_ARCH_ARCTIC,    "arctic"    },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_BITNET,    "bitnet"    },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };
 
@@ -3193,6 +3193,7 @@ struct llama_model_loader {
         llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
                 throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
             }
@@ -7029,7 +7030,6 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * attn_sub_norm,
         int32_t n_tokens,
         int32_t n_kv,
         float kq_scale,
@@ -7124,15 +7124,6 @@ static struct ggml_tensor * llm_build_kqv(
         cb(cur, "kqv_merged_cont", il);
     }
 
-    if (model.arch == LLM_ARCH_BITNET)
-    {
-        cur = llm_build_norm(ctx, cur, hparams,
-                attn_sub_norm, NULL,
-                LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_sub_norm", il);
-
-    }
-
     ggml_build_forward_expand(graph, cur);
 
     cur = ggml_mul_mat(ctx, wo, cur);
@@ -7178,7 +7169,7 @@ static struct ggml_tensor * llm_build_kv(
     struct ggml_tensor * cur;
 
     cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, nullptr, n_tokens, n_kv, kq_scale, cb, il);
+            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
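Taken together, the llama.cpp hunks above pull the BitNet special case out of the shared attention helpers: llm_build_kqv loses its attn_sub_norm parameter, llm_build_kv drops the extra nullptr argument it used to pass, and the sub-norm handling moves into the BitNet graph builder shown in the next hunk.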
@@ -11590,10 +11581,84 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
 
                 llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, model.layers[il].attn_sub_norm, n_tokens, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il
-                );
+
+                const int64_t n_ctx         = cparams.n_ctx;
+                const int64_t n_head        = hparams.n_head;
+                const int64_t n_head_kv     = hparams.n_head_kv;
+                const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                const int64_t n_embd_head_v = hparams.n_embd_head_v;
+                const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                struct ggml_tensor * q_cur         = Qcur;
+                struct ggml_tensor * kq_mask       = KQ_mask;
+                float                kq_scale      = 1.0f/sqrtf(float(n_embd_head));
+                struct ggml_tensor * attn_sub_norm = model.layers[il].attn_sub_norm;
+                struct ggml_cgraph * graph         = gf;
+                struct ggml_tensor * wo            = model.layers[il].wo;
+                struct ggml_tensor * cur_attn;
+
+                struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
+                cb(q, "q", il);
+
+                struct ggml_tensor * k =
+                    ggml_view_3d(ctx0, kv_self.k_l[il],
+                            n_embd_head_k, n_kv, n_head_kv,
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                            0);
+                cb(k, "k", il);
+
+                if (cparams.flash_attn) {
+                    // split cached v into n_head heads (not transposed)
+                    struct ggml_tensor * v =
+                        ggml_view_3d(ctx0, kv_self.v_l[il],
+                                n_embd_head_v, n_kv, n_head_kv,
+                                ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                                ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v),
+                                0);
+                    cb(v, "v", il);
+
+                    cur_attn = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+
+                    cur_attn = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
+                } else {
+                    struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+                    cb(kq, "kq", il);
+
+                    kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+                    cb(kq, "kq_soft_max_ext", il);
+
+                    GGML_ASSERT(kv_self.size == n_ctx);
+
+                    // split cached v into n_head heads
+                    struct ggml_tensor * v =
+                        ggml_view_3d(ctx0, kv_self.v_l[il],
+                                n_kv, n_embd_head_v, n_head_kv,
+                                ggml_element_size(kv_self.v_l[il])*n_ctx,
+                                ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+                                0);
+                    cb(v, "v", il);
+
+                    struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+                    cb(kqv, "kqv", il);
+
+                    struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                    cb(kqv_merged, "kqv_merged", il);
+
+                    cur_attn = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+                    cb(cur_attn, "kqv_merged_cont", il);
+                }
+
+                cur_attn = llm_build_norm(ctx0, cur_attn, hparams,
+                        attn_sub_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur_attn, "attn_sub_norm", il);
+
+                ggml_build_forward_expand(graph, cur_attn);
+
+                cur = ggml_mul_mat(ctx0, wo, cur_attn);
+
                 cb(cur, "kqv_out", il);
             }
 
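The replacement block builds the BitNet self-attention inline: Q and K views over the KV cache, then either ggml_flash_attn_ext or the soft-max/KQV path, followed by the attn_sub_norm RMS norm, ggml_build_forward_expand, and the wo output projection, reproducing what the removed LLM_ARCH_BITNET branch in llm_build_kqv used to do.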