mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-13 05:42:22 +01:00)
remove unused
commit 2a01a7ce0d (parent 5e59660173)
@@ -3729,9 +3729,6 @@ static inline __m128i get_scale_shuffle(int i) {
 //====================================== I2 ===============================================
 
 void ggml_vec_dot_i2_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
     const uint8_t * restrict x = vx;
     const int8_t * restrict y = vy;
 
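The body of ggml_vec_dot_i2_q8_0 is outside this hunk. For orientation only, a scalar reference for a dot product between a row of 2-bit (I2) weights packed four to a byte and int8 activations could look like the sketch below; the packing order and the mapping of the 2-bit codes onto {-1, 0, +1} are assumptions for illustration, not taken from this commit.

    // Hedged sketch, not the fork's implementation: scalar i2 x int8 dot product.
    // Assumes 2-bit codes packed 4 per byte, most-significant pair first, and
    // code values 0,1,2 mapping to weights -1,0,+1 (the fork may differ).
    #include <stdint.h>

    static float vec_dot_i2_int8_ref(int n, const uint8_t * w_packed, const int8_t * q) {
        int32_t acc = 0;
        for (int i = 0; i < n; ++i) {
            const uint8_t byte = w_packed[i / 4];
            const int     code = (byte >> (6 - 2 * (i % 4))) & 0x3; // assumed packing order
            const int     w    = code - 1;                          // assumed mapping to {-1,0,+1}
            acc += w * q[i];
        }
        return (float) acc; // caller applies weight/activation scales afterwards
    }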
ggml.c (58 lines changed)
@@ -1814,7 +1814,6 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
-inline static void ggml_vec_mul_f32_bitnet (const int n, float * y, const float x) { for (int i = 0; i < n; ++i) y[i] = y[i] * x; }
 
 static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
     assert(nrc == 1);
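Note: the removed ggml_vec_mul_f32_bitnet scaled a vector in place by a scalar. Upstream ggml.c already ships ggml_vec_scale_f32 with the same effect, which is presumably why this bitnet-specific copy was unused. A minimal equivalent for reference (renamed so it is not mistaken for the library's own function):

    // Same effect as the removed helper: y[i] *= v for all i.
    inline static void ggml_vec_scale_f32_ref(const int n, float * y, const float v) {
        for (int i = 0; i < n; ++i) y[i] *= v;
    }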
@@ -12434,7 +12433,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         return;
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     size_t row_size = ggml_row_size(vec_dot_type, ne10);
     if (src0->type == 31) {
         row_size = ne10;
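The row_size override above makes sense if, on the I2 path (src0->type == 31), the quantized src1 rows in wdata are stored as raw int8 values, one byte per element, with their scales kept elsewhere, rather than in q8_0 blocks that embed an fp16 scale every 32 values. A hedged worked example of the difference, assuming ne10 = 4096:

    // Hedged worked example: why the I2 path sets row_size = ne10.
    // Assumption: I2-path activations are raw int8 with per-row scales stored
    // separately, while q8_0 packs 32 values plus an fp16 scale per 34-byte block.
    #include <stdio.h>

    int main(void) {
        const int ne10 = 4096;                        // row length
        const int q8_0_block_size = 2 + 32;           // fp16 scale + 32 int8 values
        const int q8_0_row = ne10 / 32 * q8_0_block_size;
        const int i2_row   = ne10;                    // raw int8, no embedded scales
        printf("q8_0 row: %d bytes, I2-path row: %d bytes\n", q8_0_row, i2_row);
        return 0;                                      // prints 4352 vs 4096
    }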
@@ -12454,7 +12453,17 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     float tmp[32];
     uint8_t *i_weight = (uint8_t*) (src0->data);
     float * scale = (float * )((i_weight) + (ne00 * ne01 / 4));
-    float* act_scales = (float*) ((char *) wdata + ((ne11*nb11) / 4));
+    float * act_scales = (float*) ((char *) wdata + ((ne11*nb11) / 4));
+    // printf("src0->name:%s\n", src0->name);
+    // printf("src1->name:%s\n", src1->name);
+    // printf("ne03:%ld\n", ne03);
+    // printf("ne02:%ld\n", ne02);
+    // printf("ne01:%ld\n", ne01);
+    // printf("ne00:%ld\n", ne00);
+    // printf("ne13:%ld\n", ne13);
+    // printf("ne12:%ld\n", ne12);
+    // printf("ne11:%ld\n", ne11);
+    // printf("ne10:%ld\n", ne10);
 
     for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
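The pointer arithmetic in this hunk implies a particular buffer layout: the I2 weight tensor appears to hold ne00*ne01 two-bit values packed four per byte, followed by a single float scale, and wdata appears to hold the int8-quantized activations (ne11*nb11/4 bytes, i.e. one byte per f32 element of src1) followed by one float scale per activation row. The sketch below only restates those assumptions as code; it is not taken from the fork.

    // Hedged sketch of the buffer layout implied by the pointers above.
    // Assumptions: 2-bit weights packed 4 per byte, one float scale for the whole
    // weight tensor right after the packed data, nb11 == ne10 * sizeof(float),
    // and one int8 per activation value with one float scale per row.
    #include <stddef.h>
    #include <stdint.h>

    // src0->data for an I2 (type 31) tensor, as assumed by this code path:
    //   [ ne00*ne01/4 bytes packed 2-bit weights ][ float weight_scale ]
    static inline const float * i2_weight_scale(const uint8_t * src0_data, int64_t ne00, int64_t ne01) {
        return (const float *)(src0_data + ne00 * ne01 / 4);
    }

    // params->wdata on the I2 path, as assumed by this code path:
    //   [ ne11*ne10 bytes of int8 activations ][ ne11 floats of per-row scales ]
    static inline float * i2_act_scales(char * wdata, int64_t ne11, size_t nb11) {
        return (float *)(wdata + (ne11 * nb11) / 4);  // nb11/4 == ne10 for f32 src1
    }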
@@ -12472,7 +12481,9 @@ static void ggml_compute_forward_mul_mat_one_chunk(
                 const int64_t i3 = i13;
 
                 const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
-
+                // if (src0->type == 31) {
+                // printf("src0->%ld\n", (0 + i02 * nb02 + i03 * nb03));
+                // }
                 // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                 //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
                 //       the original src1 data pointer, so we should index using the indices directly
@@ -12481,22 +12492,29 @@ static void ggml_compute_forward_mul_mat_one_chunk(
                     (src1_cont || src1->type != vec_dot_type
                         ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
                         : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                // if (src0->type == 31) {
+                // printf("src1->%ld\n", (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size);
+                // }
                 float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
                 //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
                 // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                 //}
-
+                // if (src0->type == 31) {
+                // printf("dst->%ld\n", (i1 * nb1 + i2 * nb2 + i3 * nb3));
+                // }
                 for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
                     if (src0->type == 31) {
+                        // printf("row->%ld\n", (ir0 * nb01 / 4));
                         vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01 / 4, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
-                        tmp[ir0 - iir0] = tmp[ir0 - iir0] * (*scale) * (act_scales[i11]);
-                    }else {
+                        tmp[ir0 - iir0] = tmp[ir0 - iir0] / (act_scales[i11]) * (*scale);
+                    } else {
                         vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
                     }
 
                 }
-
+                // printf("num_rows_per_vec_dot->%ld\n", num_rows_per_vec_dot);
+                // printf("iir0->%ld\n", iir0);
                 for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
                     memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
                 }
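The sign of this change is easier to see together with the ggml_compute_forward_bitnet_mul_mat hunk further down: act_scales[i11] now stores s = 127 / rowmax itself instead of 1/s, so the dequantization here divides the integer dot product by s and then applies the shared weight scale. A small self-contained check of that arithmetic (all values invented for illustration):

    // Hedged numeric check: quantize a row with s = 127 / rowmax, take an integer
    // dot with ternary weights, then recover the float result as raw / s * w_scale,
    // matching the new expression tmp / act_scales[i11] * (*scale).
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float x[4]    = { 0.50f, -0.25f, 0.75f, 0.10f };  // activations (one row)
        const int   w[4]    = { 1, -1, 1, 0 };                   // ternary weights
        const float w_scale = 0.02f;                             // shared weight scale (*scale)
        const float rowmax  = 0.75f;
        const float s       = 127.0f / rowmax;                   // act_scales[i11] in the new code

        int32_t raw = 0;
        float   ref = 0.0f;
        for (int i = 0; i < 4; ++i) {
            raw += w[i] * (int32_t) roundf(x[i] * s);            // integer dot on quantized data
            ref += (float) w[i] * x[i] * w_scale;                // float reference
        }
        const float deq = (float) raw / s * w_scale;             // new: tmp / act_scales[i11] * (*scale)
        printf("dequantized %.6f vs reference %.6f\n", deq, ref); // both come out near 0.030
        return 0;
    }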
@@ -12552,23 +12570,23 @@ static void ggml_compute_forward_bitnet_mul_mat(
     if (ith != 0) {
         return;
     }
     atomic_store(&state->shared->current_chunk, nth);
     char * wdata = params->wdata;
     float* act_scales = (float*) ((char *) wdata + ((ne11*nb11) / 4));
     for (int64_t i13 = 0; i13 < ne13; i13++) {
         for (int64_t i12 = 0; i12 < ne12; i12++) {
             for (int64_t i11 = 0; i11 < ne11; i11++) {
                 float rowmax = 0.00001;
                 ggml_vec_absmaxclamp_f32(ne10, &rowmax, (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13), 0.00001);
                 float s = 127 / rowmax;
-                act_scales[i11] = 1/s;
+                act_scales[i11] = s;
                 ggml_vec_scaleroundclamp_f32_v2(ne10,
                     (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13),
                     (int8_t*) ((char *) wdata + ((i11*nb11 + i12*nb12 + i13*nb13) / 4)),
                     s, -128, 127);
+            }
         }
     }
-    }
     // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
     // atomic_store(&state->shared->current_chunk, nth);
     // // char * wdata = params->wdata;
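ggml_vec_absmaxclamp_f32 and ggml_vec_scaleroundclamp_f32_v2 are fork-specific helpers whose bodies are not part of this diff. Judging only from the call sites above, the first computes a row's absolute maximum clamped to a lower bound (so s = 127 / rowmax stays finite) and the second scales, rounds, and clamps a float row into int8. The sketches below are inferred behaviour, not the fork's code:

    // Hedged sketches inferred from the call sites; the actual helpers may differ.
    #include <math.h>
    #include <stdint.h>

    // *max_out becomes max(|x[i]|, lower_bound) over the row.
    static void vec_absmaxclamp_f32_ref(const int n, float * max_out, const float * x, float lower_bound) {
        float m = lower_bound;
        for (int i = 0; i < n; ++i) {
            const float a = fabsf(x[i]);
            if (a > m) m = a;
        }
        *max_out = m;
    }

    // y[i] = clamp(round(x[i] * s), min_v, max_v) as int8, matching the
    // (src1 row, wdata row, s, -128, 127) call above.
    static void vec_scaleroundclamp_f32_v2_ref(const int n, const float * x, int8_t * y, float s, float min_v, float max_v) {
        for (int i = 0; i < n; ++i) {
            float v = roundf(x[i] * s);
            if (v < min_v) v = min_v;
            if (v > max_v) v = max_v;
            y[i] = (int8_t) v;
        }
    }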
@@ -3192,9 +3192,7 @@ struct llama_model_loader {
 
         llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
-            printf("name:%s\n", name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
-            printf("offs:%ld\n", offs + ggml_nbytes(tensor));
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
                 throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
             }
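The bounds check kept here guards against both a corrupted offset that runs past the end of the file and unsigned overflow of offs + ggml_nbytes(tensor); the first clause (end < offs) is what catches the wrap-around case. A hedged illustration with invented values:

    // Why "offs + nbytes < offs" matters: with size_t arithmetic a corrupt offset
    // can wrap around, so the sum must be checked for overflow before trusting
    // the "> file->size" comparison. All values below are made up.
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const size_t file_size = 8ull * 1024 * 1024 * 1024;  // 8 GiB model file (example)
        const size_t offs      = SIZE_MAX - 100;              // corrupt offset near the top of size_t
        const size_t nbytes    = 4096;                        // tensor payload size
        const size_t end       = offs + nbytes;               // wraps around to a small value
        const int bad = (end < offs) || (end > file_size);
        printf("end=%zu bad=%d\n", end, bad);                  // overflow caught by end < offs
        return 0;
    }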