Merge pull request #9 from ggerganov/sl/fix_fix_lora

fix lora issues
This commit is contained in:
Xuan Son Nguyen 2024-07-10 10:33:42 +02:00 committed by GitHub
commit 4fe0861a89
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 37 additions and 49 deletions

View File

@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
&& src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

View File

@ -2831,7 +2831,7 @@ struct llama_context {
struct llama_lora_weight { struct llama_lora_weight {
struct ggml_tensor * a = nullptr; struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr; struct ggml_tensor * b = nullptr;
llama_lora_weight() {} llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
}; };
@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
static const int n_inp_tensors = 5; // see llama_model
static const int n_out_tensors = 5; // see llama_model
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);
ggml_context * ctx = nullptr; ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = { struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false, /* .no_alloc = */ true,
/* .ctx = */ &ctx, /* .ctx = */ &ctx,
}; };
struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
@ -18536,7 +18534,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
// check metadata // check metadata
{ {
auto get_kv_str = [&](std::string key) -> std::string { auto get_kv_str = [&](std::string key) -> std::string {
std::vector<char> str_buf(32, 0); // we only get the arch, so no need big buffer here
int id = gguf_find_key(ctx_gguf, key.c_str()); int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
}; };
@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
auto lora_arch = llm_arch_from_string(lora_arch_name); auto lora_arch = llm_arch_from_string(lora_arch_name);
if (lora_arch != model->arch) { if (lora_arch != model->arch) {
gguf_free(ctx_gguf);
throw std::runtime_error("model arch and LoRA arch mismatch"); throw std::runtime_error("model arch and LoRA arch mismatch");
} }
auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE));
if (train_type != "finetune_lora") { if (train_type != "finetune_lora") {
gguf_free(ctx_gguf);
throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type);
} }
} }
// calculate n_tensors_per_layer int n_tensors = gguf_get_n_tensors(ctx_gguf);
int n_tensors_per_layer = 0;
{
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
int il = -1;
sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il);
if (il == 0) n_tensors_per_layer++;
}
}
// count layer buffer types // contexts for each buffer type
std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
for (int64_t i = 0; i < model->hparams.n_layer; i++) {
buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer;
}
buft_tensor_count[model->buft_input.buft] += n_inp_tensors;
buft_tensor_count[model->buft_output.buft] += n_out_tensors;
// allocate contexts
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
{ auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto new_ggml_ctx = [](size_t n_tensors) { auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
// add a new context
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
return ggml_init(params); ggml_context * buft_ctx = ggml_init(params);
ctx_map[buft] = buft_ctx;
return buft_ctx;
};
return it->second;
}; };
for (auto & it : buft_tensor_count) {
int n_tensors = it.second;
// LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
}
}
// bundle lora_a and lora_b into pairs // bundle lora_a and lora_b into pairs
std::map<std::string, llama_lora_weight> ab_map; std::map<std::string, llama_lora_weight> ab_map;
@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
ab_map[name].b = cur; ab_map[name].b = cur;
} }
} else { } else {
// maybe "optimizer.*"" tensors gguf_free(ctx_gguf);
LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); ggml_free(ctx);
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
} }
} }
// add tensors // add tensors
for (auto & it : ab_map) { for (auto & it : ab_map) {
std::string name = it.first; const std::string & name = it.first;
const char * cname = name.c_str();
llama_lora_weight & w = it.second; llama_lora_weight & w = it.second;
GGML_ASSERT(w.a != nullptr);
GGML_ASSERT(w.b != nullptr); if (!w.a || !w.b) {
int il = -1; gguf_free(ctx_gguf);
sscanf(cname, "blk.%d.", &il); ggml_free(ctx);
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
}
// device buft and device ctx // device buft and device ctx
auto model_tensor = llama_get_model_tensor(model, cname); auto * model_tensor = llama_get_model_tensor(model, name.c_str());
if (!model_tensor) { if (!model_tensor) {
gguf_free(ctx_gguf); gguf_free(ctx_gguf);
ggml_free(ctx); ggml_free(ctx);
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
} }
struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape // validate tensor shape
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("tensor '" + name + "' has incorrect shape"); throw std::runtime_error("tensor '" + name + "' has incorrect shape");
} }
if (w.a->ne[1] != w.b->ne[0]) { if (w.a->ne[1] != w.b->ne[0]) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
} }
// save tensor to adapter // save tensor to adapter
@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
ggml_free(ctx); ggml_free(ctx);
throw std::runtime_error("failed to allocate buffer for lora adapter\n"); throw std::runtime_error("failed to allocate buffer for lora adapter\n");
} }
ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
adapter.ctxs.push_back(ctx_dev); adapter.ctxs.push_back(ctx_dev);
adapter.bufs.push_back(buf); adapter.bufs.push_back(buf);
} }
@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
size_t size = ggml_nbytes(orig); size_t size = ggml_nbytes(orig);
if (read_buf.size() < size) {
read_buf.resize(size); read_buf.resize(size);
}
gguf_file.seek(offs, SEEK_SET); gguf_file.seek(offs, SEEK_SET);
gguf_file.read_raw(read_buf.data(), size); gguf_file.read_raw(read_buf.data(), size);
// LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
ggml_backend_tensor_set(dev, read_buf.data(), 0, size); ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
}; };
for (auto & it : adapter.ab_map) { for (auto & it : adapter.ab_map) {