diff --git a/include/llama.h b/include/llama.h
index a4abf395b..bb6970cb7 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -385,6 +385,7 @@ extern "C" {
     } llama_chat_message;
 
     // lora adapter
+    // TODO: rename to llama_adapter_lora
     struct llama_lora_adapter;
 
     // Helpers for getting default parameters
@@ -501,14 +502,20 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    //
+    // Adapters
+    //
+
     // Load a LoRA adapter from file
     // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+    // TODO: rename to llama_adapter_lora_init
    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
             const char * path_lora);
 
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
+    // TODO: rename to llama_set_adapter_lora
    LLAMA_API int32_t llama_lora_adapter_set(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter,
@@ -516,16 +523,19 @@ extern "C" {
 
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
+    // TODO: rename to llama_rm_adapter_lora
    LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
     // Remove all LoRA adapters from given context
+    // TODO: rename to llama_clear_adapter_lora
    LLAMA_API void llama_lora_adapter_clear(
             struct llama_context * ctx);
 
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
+    // TODO: rename to llama_adapter_lora_free
    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -534,6 +544,7 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
+    // TODO: rename to llama_adapter_vec_apply
    LLAMA_API int32_t llama_control_vector_apply(
             struct llama_context * lctx,
             const float * data,
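Taken together, the adapter API above is meant to be driven as in the following minimal sketch (illustrative only, not part of the patch; `model`, `ctx`, and the adapter path are placeholders, and llama_lora_adapter_init is assumed to return NULL on failure):

    // load the adapter; it is owned by the model and released when the model is deleted
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "my-adapter.gguf");
    if (adapter != NULL) {
        // attach it to a context at full strength; the base weights are not modified
        llama_lora_adapter_set(ctx, adapter, 1.0f);

        // ... run decoding as usual ...

        // detach it (returns -1 if it was not attached), or drop every adapter at once
        llama_lora_adapter_remove(ctx, adapter);
        llama_lora_adapter_clear(ctx);

        // optional: free it early instead of waiting for the model to be deleted
        llama_lora_adapter_free(adapter);
    }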
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f631da88f..666632c25 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -12,7 +12,7 @@ add_library(llama
             llama-arch.cpp
             llama-batch.cpp
             llama-context.cpp
-            llama-control-vector.cpp
+            llama-adapter.cpp
             llama-grammar.cpp
             llama-kv-cache.cpp
             llama-mmap.cpp
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
new file mode 100644
index 000000000..587210512
--- /dev/null
+++ b/src/llama-adapter.cpp
@@ -0,0 +1,5 @@
+#include "llama-adapter.h"
+
+void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
+    delete adapter;
+}
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
new file mode 100644
index 000000000..931dfdb5e
--- /dev/null
+++ b/src/llama-adapter.h
@@ -0,0 +1,354 @@
+#pragma once
+
+#include "llama-impl.h"
+#include "ggml-cpp.h"
+
+#include "llama-model.h" // TODO: need only hparams
+
+#include <unordered_map>
+#include <vector>
+
+//
+// llama_adapter_vec
+//
+
+// TODO: rename to llama_adapter_vec
+struct llama_control_vector {
+    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    struct ggml_tensor * tensor_for(int il) const {
+        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+            return nullptr;
+        }
+        return tensors[il];
+    }
+
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+        ggml_tensor * layer_dir = tensor_for(il);
+        if (layer_dir != nullptr) {
+            cur = ggml_add(ctx, cur, layer_dir);
+        }
+        return cur;
+    }
+};
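For orientation, a two-line sketch (not part of the patch) of how this struct is consumed when the compute graph is built; `lctx` is assumed to be the current llama_context (which owns a `cvec` member, see llama-context.h below), `ctx0` the graph's ggml_context, `cur` the layer output, and `il` the layer index:

    // adds the control-vector direction for layer il, if one is loaded and il is in range
    cur = lctx.cvec.apply_to(ctx0, cur, il);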
+
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ model.hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = ctx;
+            cvec.ctxs.emplace_back(ctx);
+            return ctx;
+        }
+        return it->second;
+    };
+
+    // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
+            [&](ggml_context * ctx) {
+                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+                return ggml_add(ctx, cur, layer_dir);
+            });
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    cvec.bufs.reserve(ctx_map.size());
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) model.hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
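To make the buffer layout expected by this helper concrete, a small sketch (not part of the patch; `cvec` and `model` are assumed to be the context's control vector and its model): the caller packs one n_embd-sized direction per layer, starting at layer 1, and row (il - 1) of the buffer is uploaded to layer il.

    const int32_t n_embd  = (int32_t) model.hparams.n_embd;
    const int32_t n_layer = (int32_t) model.hparams.n_layer;

    // one direction per layer, layer 0 is always skipped; rows of zeros leave a layer untouched
    std::vector<float> data((size_t) (n_layer - 1) * n_embd, 0.0f);

    const int32_t res = llama_control_vector_apply(cvec, model, data.data(), data.size(),
                                                   n_embd, /*il_start =*/ 1, /*il_end =*/ n_layer - 1);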
"" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); + }; + auto get_kv_f32 = [&](const std::string & key) -> float { + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); + }; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + + auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto general_arch = llm_arch_from_string(general_arch_str); + if (general_arch != model->arch) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + + auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); + } + + int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); + + // contexts for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context + struct ggml_init_params params = { + /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * buft_ctx = ggml_init(params); + if (!buft_ctx) { + return nullptr; + } + ctx_map[buft] = buft_ctx; + adapter.ctxs.emplace_back(buft_ctx); + return buft_ctx; + }; + return it->second; + }; + + // bundle lora_a and lora_b into pairs + std::map ab_map; + auto str_endswith = [](const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; + }; + for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { + std::string name(cur->name); + if (str_endswith(name, ".lora_a")) { + replace_all(name, ".lora_a", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = llama_lora_weight(cur, nullptr); + } else { + ab_map[name].a = cur; + } + } else if (str_endswith(name, ".lora_b")) { + replace_all(name, ".lora_b", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = llama_lora_weight(nullptr, cur); + } else { + ab_map[name].b = cur; + } + } else { + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); + } + } + + // add tensors + for (auto & it : ab_map) { + const std::string & name = it.first; + llama_lora_weight & w = it.second; + + if (!w.a || !w.b) { + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + + // device buft and device ctx + auto * model_tensor = llama_get_model_tensor(model, name.c_str()); + if (!model_tensor) { + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); + } + struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); + // validate tensor shape + if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + throw std::runtime_error("tensor '" + name + "' has incorrect shape"); + } + if (w.a->ne[1] != w.b->ne[0]) { + throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + } + // save tensor to adapter + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct 
+        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_set_name(tensor_a, w.a->name);
+        ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+    }
+
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto & it : ctx_map) {
+            ggml_backend_buffer_type_t buft = it.first;
+            ggml_context * ctx_dev = it.second;
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+            if (!buf) {
+                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+            }
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
+        }
+    }
+
+    // set tensor data
+    {
+        llama_file gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
+            size_t size = ggml_nbytes(orig);
+            read_buf.resize(size);
+            gguf_file.seek(offs, SEEK_SET);
+            gguf_file.read_raw(read_buf.data(), size);
+            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+        };
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev  = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
+        }
+    }
+
+    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+}
diff --git a/src/llama-context.h b/src/llama-context.h
index bae5e8321..596ba0de1 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -4,7 +4,7 @@
 #include "llama-batch.h"
 #include "llama-model.h"
 #include "llama-kv-cache.h"
-#include "llama-control-vector.h"
+#include "llama-adapter.h"
 
 #include "ggml-cpp.h"
 
@@ -54,7 +54,7 @@ struct llama_context {
     const struct llama_model & model;
 
     struct llama_cparams      cparams;
-    struct llama_sbatch       sbatch;
+    struct llama_sbatch       sbatch;  // TODO: revisit if needed
     struct llama_kv_cache     kv_self;
     struct llama_control_vector cvec;
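The loaded (A, B) pairs are looked up per weight via get_weight() when the graph is built. As a hedged sketch of that consumption (simplified, and not introduced by this patch; llama.cpp's actual graph builder differs in details), the low-rank update is folded into each matmul roughly like this, where `scale` is the value passed to llama_lora_adapter_set and `alpha` comes from the adapter's GGUF metadata:

    static struct ggml_tensor * build_mm_with_lora(
            struct ggml_context * ctx0,
            struct ggml_tensor  * w,    // base weight, shape [n_in, n_out]
            struct ggml_tensor  * cur,  // activations, shape [n_in, n_tokens]
            struct llama_lora_adapter * adapter,
            float scale) {
        struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);

        struct llama_lora_weight * lw = adapter->get_weight(w);
        if (lw == nullptr) {
            return res; // this weight has no LoRA pair in the adapter
        }

        // low-rank update: res += scale * alpha/rank * (B * (A * cur))
        const float rank = (float) lw->b->ne[0];
        const float s    = adapter->alpha ? scale * adapter->alpha / rank : scale;

        struct ggml_tensor * ab = ggml_mul_mat(ctx0, lw->b, ggml_mul_mat(ctx0, lw->a, cur));
        ab = ggml_scale(ctx0, ab, s);

        return ggml_add(ctx0, res, ab);
    }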
diff --git a/src/llama-control-vector.cpp b/src/llama-control-vector.cpp
deleted file mode 100644
index 3a4512aac..000000000
--- a/src/llama-control-vector.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "llama-control-vector.h"
diff --git a/src/llama-control-vector.h b/src/llama-control-vector.h
deleted file mode 100644
index 695fc2a3b..000000000
--- a/src/llama-control-vector.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#pragma once
-
-#include "llama-impl.h"
-#include "ggml-cpp.h"
-
-#include "llama-model.h" // TODO: need only hparams
-
-#include <string>
-#include <vector>
-
-struct llama_control_vector {
-    std::vector<struct ggml_tensor *> tensors; // per layer
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    int32_t layer_start = -1;
-    int32_t layer_end   = -1;
-
-    struct ggml_tensor * tensor_for(int il) const {
-        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
-            return nullptr;
-        }
-        return tensors[il];
-    }
-
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
-        ggml_tensor * layer_dir = tensor_for(il);
-        if (layer_dir != nullptr) {
-            cur = ggml_add(ctx, cur, layer_dir);
-        }
-        return cur;
-    }
-};
-
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
-
-    // create a context for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        auto it = ctx_map.find(buft);
-        if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
-                /*.mem_size   =*/ model.hparams.n_layer*ggml_tensor_overhead(),
-                /*.mem_buffer =*/ NULL,
-                /*.no_alloc   =*/ true,
-            };
-            ggml_context * ctx = ggml_init(params);
-            if (!ctx) {
-                return nullptr;
-            }
-            ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
-            return ctx;
-        }
-        return it->second;
-    };
-
-    // make tensors
-    cvec.tensors.reserve(model.hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
-    for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                return ggml_add(ctx, cur, layer_dir);
-            });
-        ggml_context * ctx = ctx_for_buft(buft);
-        if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
-            return false;
-        }
-        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-        cvec.tensors.push_back(tensor);
-    }
-
-    // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
-    for (auto it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-        if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
-            return false;
-        }
-        ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
-    }
-
-    return true;
-}
-
-static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
-    if (data == nullptr) {
-        // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
-        return 0;
-    }
-
-    if (n_embd != (int) model.hparams.n_embd) {
-        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
-    }
-
-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
-            return 1;
-        }
-    }
-
-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
-
-    for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
-
-        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
-        if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
-        }
-    }
-
-    return 0;
-}
-
diff --git a/src/llama.cpp b/src/llama.cpp
index 9503b4204..b7f55ae4e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -230,58 +230,10 @@ static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
-struct llama_lora_weight {
-    struct ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
-    llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
-};
-
-struct llama_lora_adapter {
-    struct llama_model * base_model;
-    // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    float alpha;
-
-    llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
-        base_model->lora_adapters.insert(this);
-    }
-
-    llama_lora_weight * get_weight(struct ggml_tensor * w) {
-        std::string name(w->name);
-        auto pos = ab_map.find(name);
-        if (ab_map.find(name) != ab_map.end()) {
-            return &pos->second;
-        }
-        return nullptr;
-    }
-
-    ~llama_lora_adapter() {
-        auto pos = base_model->lora_adapters.find(this);
-        if (pos != base_model->lora_adapters.end()) {
-            base_model->lora_adapters.erase(pos);
-        }
-    }
-};
-
 static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
-static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
-    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == model->tensors_by_name.end()) {
-        return nullptr;
-    }
-    return it->second;
-}
-
 //
 // model loading and saving
 //
@@ -15884,172 +15836,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
-    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
-
-    ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ true,
-        /* .ctx      = */ &ctx_init,
-    };
-
-    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
-    if (!ctx_gguf) {
-        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
-    }
-
-    ggml_context_ptr ctx { ctx_init };
-
-    // check metadata
-    {
-        auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
-        };
-        auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
-        };
-        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
-        if (general_type != "adapter") {
-            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
-        }
-
-        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
-        auto general_arch = llm_arch_from_string(general_arch_str);
-        if (general_arch != model->arch) {
-            throw std::runtime_error("model arch and LoRA arch mismatch");
-        }
-
-        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
-        if (adapter_type != "lora") {
-            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
-        }
-
-        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
-    }
-
-    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
-
-    // contexts for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        auto it = ctx_map.find(buft);
-        if (it == ctx_map.end()) {
-            // add a new context
-            struct ggml_init_params params = {
-                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
-                /*.mem_buffer =*/ NULL,
-                /*.no_alloc   =*/ true,
-            };
-            ggml_context * buft_ctx = ggml_init(params);
-            if (!buft_ctx) {
-                return nullptr;
-            }
-            ctx_map[buft] = buft_ctx;
-            adapter.ctxs.emplace_back(buft_ctx);
-            return buft_ctx;
-        };
-        return it->second;
-    };
-
-    // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
-    auto str_endswith = [](const std::string & str, const std::string & suffix) {
-        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-    };
-    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
-        std::string name(cur->name);
-        if (str_endswith(name, ".lora_a")) {
-            replace_all(name, ".lora_a", "");
-            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
-            } else {
-                ab_map[name].a = cur;
-            }
-        } else if (str_endswith(name, ".lora_b")) {
-            replace_all(name, ".lora_b", "");
-            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
-            } else {
-                ab_map[name].b = cur;
-            }
-        } else {
-            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
-        }
-    }
-
-    // add tensors
-    for (auto & it : ab_map) {
-        const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
-
-        if (!w.a || !w.b) {
-            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
-        }
-
-        // device buft and device ctx
-        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
-        if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
-        }
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
-        // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
-        }
-        // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
-        ggml_set_name(tensor_a, w.a->name);
-        ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
-    }
-
-    // allocate tensors / buffers and zero
-    {
-        adapter.ctxs.reserve(ctx_map.size());
-        adapter.bufs.reserve(ctx_map.size());
-        for (auto & it : ctx_map) {
-            ggml_backend_buffer_type_t buft = it.first;
-            ggml_context * ctx_dev = it.second;
-            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
-            if (!buf) {
-                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
-            }
-            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
-            adapter.bufs.emplace_back(std::move(buf));
-        }
-    }
-
-    // set tensor data
-    {
-        llama_file gguf_file(path_lora, "rb");
-        std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
-            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
-            size_t size = ggml_nbytes(orig);
-            read_buf.resize(size);
-            gguf_file.seek(offs, SEEK_SET);
-            gguf_file.read_raw(read_buf.data(), size);
-            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
-        };
-        for (auto & it : adapter.ab_map) {
-            auto orig = ab_map[it.first];
-            auto dev  = it.second;
-            set_tensor(orig.a, dev.a);
-            set_tensor(orig.b, dev.b);
-        }
-    }
-
-    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
-}
-
 int32_t llama_lora_adapter_set(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter,
@@ -16077,10 +15863,6 @@ void llama_lora_adapter_clear(struct llama_context * ctx) {
     ctx->lora_adapters.clear();
 }
 
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
 // TODO: tmp
 int32_t llama_control_vector_apply(
         struct llama_context * lctx,
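The public entry points remain in llama.cpp and are expected to shrink to thin forwarders into llama-adapter. As a sketch (assumed, not shown in this patch) of what the control-vector wrapper above reduces to, using the `cvec` and `model` members declared in llama-context.h:

    // sketch: the public C API forwards to the static helper now living in llama-adapter.h
    int32_t llama_control_vector_apply(
            struct llama_context * lctx,
                     const float * data,
                          size_t   len,
                         int32_t   n_embd,
                         int32_t   il_start,
                         int32_t   il_end) {
        // lctx->cvec is the per-context llama_control_vector, lctx->model its associated model
        return llama_control_vector_apply(lctx->cvec, lctx->model, data, len, n_embd, il_start, il_end);
    }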