diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 1461bc963..e0cb718b2 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -33,19 +33,43 @@ class IMatrixCollector { public: IMatrixCollector() = default; void set_parameters(StatParams&& params) { m_params = std::move(params); } - void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix() const; private: std::unordered_map m_stats; StatParams m_params; std::mutex m_mutex; int m_last_call = 0; + std::vector m_src1_data; }; -void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return; +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op != GGML_OP_MUL_MAT) return false; + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + return true; + } + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host || !ggml_is_contiguous(src1)) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); + auto& e = m_stats[src0->name]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); @@ -59,7 +83,7 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = (const float *)src1->data + row * src1->ne[0]; + const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; } @@ -70,6 +94,8 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st save_imatrix(); } } + + return true; } void IMatrixCollector::save_imatrix() const { @@ -93,8 +119,8 @@ void IMatrixCollector::save_imatrix() const { static IMatrixCollector g_collector; -static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - g_collector.collect_imatrix(src0, src1); +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); } @@ -320,8 +346,6 @@ int main(int argc, char ** argv) { g_collector.set_parameters(std::move(sparams)); - ggml_set_imatrix_collection(ik_collect_imatrix); - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -340,16 +364,27 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_model_params mparams = llama_model_params_from_gpt_params(params); - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + cparams.cb_eval = ik_collect_imatrix; + cparams.cb_eval_user_data = NULL; + + llama_context * ctx = llama_new_context_with_model(model, cparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to create context\n", __func__); + return 1; + } + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", diff --git a/ggml.c b/ggml.c index ef5888ab2..53197d477 100644 --- a/ggml.c +++ b/ggml.c @@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); -ggml_collect_imatrix_t g_imatrix_collect = NULL; - -void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) { - g_imatrix_collect = imatrix_collect; -} - static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - if (ith == 1 && g_imatrix_collect) { - g_imatrix_collect(src0, src1); - } - const enum ggml_type type = src0->type; const bool src1_cont = ggml_is_contiguous(src1); @@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id( const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; - if (ith == 1 && g_imatrix_collect) { - g_imatrix_collect(src0_cur, src1); - } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); diff --git a/ggml.h b/ggml.h index 1187074f7..4a2746729 100644 --- a/ggml.h +++ b/ggml.h @@ -2075,12 +2075,6 @@ extern "C" { GGML_API void ggml_init_iq2_quantization(enum ggml_type type); GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type); - // - // Importance matrix - // - typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1); - GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect); - // // gguf //