mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
imatrix : offload to GPU support
This commit is contained in:
parent
e0493800ce
commit
0b2fca9a9f
@ -33,19 +33,43 @@ class IMatrixCollector {
|
|||||||
public:
|
public:
|
||||||
IMatrixCollector() = default;
|
IMatrixCollector() = default;
|
||||||
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
||||||
void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
void save_imatrix() const;
|
void save_imatrix() const;
|
||||||
private:
|
private:
|
||||||
std::unordered_map<std::string, Stats> m_stats;
|
std::unordered_map<std::string, Stats> m_stats;
|
||||||
StatParams m_params;
|
StatParams m_params;
|
||||||
std::mutex m_mutex;
|
std::mutex m_mutex;
|
||||||
int m_last_call = 0;
|
int m_last_call = 0;
|
||||||
|
std::vector<float> m_src1_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
|
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
|
GGML_UNUSED(user_data);
|
||||||
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
|
|
||||||
|
const struct ggml_tensor * src0 = t->src[0];
|
||||||
|
const struct ggml_tensor * src1 = t->src[1];
|
||||||
|
|
||||||
|
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||||
|
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
|
||||||
|
if (ask) {
|
||||||
|
if (t->op != GGML_OP_MUL_MAT) return false;
|
||||||
|
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
|
||||||
|
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::lock_guard<std::mutex> lock(m_mutex);
|
std::lock_guard<std::mutex> lock(m_mutex);
|
||||||
|
|
||||||
|
// copy the data from the GPU memory if needed
|
||||||
|
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
|
||||||
|
|
||||||
|
if (!is_host || !ggml_is_contiguous(src1)) {
|
||||||
|
m_src1_data.resize(ggml_nelements(src1));
|
||||||
|
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
|
||||||
|
}
|
||||||
|
|
||||||
|
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||||
|
|
||||||
auto& e = m_stats[src0->name];
|
auto& e = m_stats[src0->name];
|
||||||
if (e.values.empty()) {
|
if (e.values.empty()) {
|
||||||
e.values.resize(src1->ne[0], 0);
|
e.values.resize(src1->ne[0], 0);
|
||||||
@ -59,7 +83,7 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st
|
|||||||
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
|
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
|
||||||
}
|
}
|
||||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||||
const float * x = (const float *)src1->data + row * src1->ne[0];
|
const float * x = data + row * src1->ne[0];
|
||||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||||
e.values[j] += x[j]*x[j];
|
e.values[j] += x[j]*x[j];
|
||||||
}
|
}
|
||||||
@ -70,6 +94,8 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st
|
|||||||
save_imatrix();
|
save_imatrix();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void IMatrixCollector::save_imatrix() const {
|
void IMatrixCollector::save_imatrix() const {
|
||||||
@ -93,8 +119,8 @@ void IMatrixCollector::save_imatrix() const {
|
|||||||
|
|
||||||
static IMatrixCollector g_collector;
|
static IMatrixCollector g_collector;
|
||||||
|
|
||||||
static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
|
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
g_collector.collect_imatrix(src0, src1);
|
return g_collector.collect_imatrix(t, ask, user_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -320,8 +346,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
g_collector.set_parameters(std::move(sparams));
|
g_collector.set_parameters(std::move(sparams));
|
||||||
|
|
||||||
ggml_set_imatrix_collection(ik_collect_imatrix);
|
|
||||||
|
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
@ -340,16 +364,27 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_backend_init(params.numa);
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||||
llama_context * ctx;
|
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
cparams.cb_eval = ik_collect_imatrix;
|
||||||
|
cparams.cb_eval_user_data = NULL;
|
||||||
|
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
if (params.n_ctx > n_ctx_train) {
|
if (params.n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
|
14
ggml.c
14
ggml.c
@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
||||||
|
|
||||||
ggml_collect_imatrix_t g_imatrix_collect = NULL;
|
|
||||||
|
|
||||||
void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
|
|
||||||
g_imatrix_collect = imatrix_collect;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I8] = {
|
[GGML_TYPE_I8] = {
|
||||||
.type_name = "i8",
|
.type_name = "i8",
|
||||||
@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat(
|
|||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
if (ith == 1 && g_imatrix_collect) {
|
|
||||||
g_imatrix_collect(src0, src1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
|
|
||||||
const bool src1_cont = ggml_is_contiguous(src1);
|
const bool src1_cont = ggml_is_contiguous(src1);
|
||||||
@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id(
|
|||||||
|
|
||||||
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
||||||
|
|
||||||
if (ith == 1 && g_imatrix_collect) {
|
|
||||||
g_imatrix_collect(src0_cur, src1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||||
|
|
||||||
|
6
ggml.h
6
ggml.h
@ -2075,12 +2075,6 @@ extern "C" {
|
|||||||
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
||||||
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
||||||
|
|
||||||
//
|
|
||||||
// Importance matrix
|
|
||||||
//
|
|
||||||
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
|
||||||
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// gguf
|
// gguf
|
||||||
//
|
//
|
||||||
|
Loading…
Reference in New Issue
Block a user