llama : control-vector -> adapter

This commit is contained in:
Georgi Gerganov 2024-12-22 15:49:03 +02:00
parent f9b0e3b382
commit 7b5b594526
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
8 changed files with 373 additions and 352 deletions

View File

@ -385,6 +385,7 @@ extern "C" {
} llama_chat_message;
// lora adapter
// TODO: rename to llama_adapter_lora
struct llama_lora_adapter;
// Helpers for getting default parameters
@ -501,14 +502,20 @@ extern "C" {
const char * fname_out,
const llama_model_quantize_params * params);
//
// Adapters
//
// Load a LoRA adapter from file
// The loaded adapter will be associated to the given model, and will be freed when the model is deleted
// TODO: rename to llama_adapter_lora_init
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
struct llama_model * model,
const char * path_lora);
// Add a loaded LoRA adapter to given context
// This will not modify the model's weights
// TODO: rename to llama_set_adapter_lora
LLAMA_API int32_t llama_lora_adapter_set(
struct llama_context * ctx,
struct llama_lora_adapter * adapter,
@ -516,16 +523,19 @@ extern "C" {
// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
// TODO: rename to llama_rm_adapter_lora
LLAMA_API int32_t llama_lora_adapter_remove(
struct llama_context * ctx,
struct llama_lora_adapter * adapter);
// Remove all LoRA adapters from given context
// TODO: rename to llama_clear_adapter_lora
LLAMA_API void llama_lora_adapter_clear(
struct llama_context * ctx);
// Manually free a LoRA adapter
// Note: loaded adapters will be freed when the associated model is deleted
// TODO: rename to llama_adapter_lora_free
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
@ -534,6 +544,7 @@ extern "C" {
// to an n_embd x n_layers buffer starting from layer 1.
// il_start and il_end are the layer range the vector should apply to (both inclusive)
// See llama_control_vector_load in common to load a control vector.
// TODO: rename to llama_adapter_vec_apply
LLAMA_API int32_t llama_control_vector_apply(
struct llama_context * lctx,
const float * data,

View File

@ -12,7 +12,7 @@ add_library(llama
llama-arch.cpp
llama-batch.cpp
llama-context.cpp
llama-control-vector.cpp
llama-adapter.cpp
llama-grammar.cpp
llama-kv-cache.cpp
llama-mmap.cpp

5
src/llama-adapter.cpp Normal file
View File

@ -0,0 +1,5 @@
#include "llama-adapter.h"
// Destroy a LoRA adapter by invoking `delete` on it.
// The llama_lora_adapter destructor runs as part of this call.
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
delete adapter;
}

354
src/llama-adapter.h Normal file
View File

@ -0,0 +1,354 @@
#pragma once
#include "llama-impl.h"
#include "ggml-cpp.h"
#include "llama-model.h" // TODO: need only hparams
#include <vector>
#include <map>
//
// llama_adapter_vec
//
// TODO: rename to llama_adapter_vec
// Per-layer control vector: one direction tensor per layer, plus the inclusive
// layer range in which those tensors are currently active.
struct llama_control_vector {
    std::vector<struct ggml_tensor *> tensors; // indexed by layer

    // owners of the tensor metadata and of the backend memory behind it
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // inclusive active range; the default -1/-1 matches no layer
    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    // Return the direction tensor for layer `il`, or nullptr when `il` is
    // negative, outside [layer_start, layer_end], or past the end of `tensors`.
    struct ggml_tensor * tensor_for(int il) const {
        const bool active = il >= 0
                         && il >= layer_start
                         && il <= layer_end
                         && (size_t) il < tensors.size();

        return active ? tensors[il] : nullptr;
    }

    // Add the direction tensor of layer `il` to `cur` when one is active;
    // otherwise hand `cur` back unchanged.
    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
        ggml_tensor * dir = tensor_for(il);
        if (dir == nullptr) {
            return cur;
        }

        return ggml_add(ctx, cur, dir);
    }
};
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
struct ggml_init_params params = {
/*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
return nullptr;
}
ctx_map[buft] = ctx;
cvec.ctxs.emplace_back(ctx);
return ctx;
}
return it->second;
};
// make tensors
cvec.tensors.reserve(model.hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < model.hparams.n_layer; il++) {
ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
[&](ggml_context * ctx) {
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
return ggml_add(ctx, cur, layer_dir);
});
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
cvec.tensors.push_back(tensor);
}
// allocate tensors / buffers and zero
cvec.bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.bufs.emplace_back(buf);
}
return true;
}
static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
return 0;
}
if (n_embd != (int) model.hparams.n_embd) {
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
return 1;
}
if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
return 1;
}
}
cvec.layer_start = il_start;
cvec.layer_end = il_end;
for (size_t il = 1; il < model.hparams.n_layer; il++) {
assert(cvec.tensors[il] != nullptr);
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
}
}
return 0;
}
//
// llama_adapter_lora
//
// TODO: rename to llama_adapter_lora_weight
// The pair of tensors (A and B) that make up one LoRA weight.
// Both pointers are non-owning and default to nullptr.
struct llama_lora_weight {
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;

    // empty pair: both components unset
    llama_lora_weight() = default;

    // pair built from the given A and B tensors
    llama_lora_weight(struct ggml_tensor * lora_a, struct ggml_tensor * lora_b)
        : a(lora_a)
        , b(lora_b) {
    }
};
// TODO: rename to llama_adapter_lora
// A LoRA adapter bound to a base model.
//
// The adapter inserts itself into base_model->lora_adapters on construction and
// removes itself again on destruction.
struct llama_lora_adapter {
    struct llama_model * base_model;

    // map tensor name to lora_a_b
    std::unordered_map<std::string, struct llama_lora_weight> ab_map;

    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // LoRA alpha; zero until a loader assigns it
    float alpha = 0.0f;

    llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
        base_model->lora_adapters.insert(this);
    }

    // Return the A/B pair stored under the name of tensor `w`,
    // or nullptr when the adapter has no entry for that name.
    llama_lora_weight * get_weight(struct ggml_tensor * w) {
        std::string name(w->name);
        auto pos = ab_map.find(name); // single lookup, reused for the result
        if (pos != ab_map.end()) {
            return &pos->second;
        }
        return nullptr;
    }

    ~llama_lora_adapter() {
        auto pos = base_model->lora_adapters.find(this);
        if (pos != base_model->lora_adapters.end()) {
            base_model->lora_adapters.erase(pos);
        }
    }
};
// Return the first tensor in model->tensors_by_name whose name equals `name`,
// or nullptr when no entry matches.
static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
    for (const auto & entry : model->tensors_by_name) {
        if (entry.first == name) {
            return entry.second;
        }
    }

    return nullptr;
}
// Load a LoRA adapter from the GGUF file `path_lora` into `adapter`.
//
// Steps:
//   1. read the GGUF metadata (no tensor data) and check general.type, the
//      architecture against model->arch, and adapter.type; read the LoRA alpha
//   2. pair up the "<name>.lora_a" / "<name>.lora_b" tensors
//   3. for each pair, find the base-model tensor "<name>", validate the shapes and
//      duplicate both tensors into a context keyed by the buffer type of that
//      base-model tensor
//   4. allocate one backend buffer per buffer type
//   5. read the tensor data from the file and upload it
//
// Throws std::runtime_error when the file cannot be loaded, the metadata does not
// match, a tensor has an unexpected suffix, a pair is incomplete, the base tensor is
// missing, a shape is wrong, or a context/buffer cannot be allocated.
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    // takes ownership of the metadata context filled in by gguf_init_from_file
    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        // missing keys yield "" / 0.0f
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model->arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx); // the adapter now owns the context
            return buft_ctx;
        }
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_lora_weight & w = it.second;

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
        }

        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        if (!dev_ctx) {
            // ggml_init failed; without this check the null context would reach ggml_dup_tensor
            throw std::runtime_error("failed to allocate context for lora adapter");
        }

        // validate tensor shape
        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (w.a->ne[1] != w.b->ne[0]) {
            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
        }

        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf; // staging buffer reused for every tensor

        // copy the bytes of metadata tensor `orig` from the file into device tensor `dev`
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };

        for (auto & it : adapter.ab_map) {
            // every adapter entry was created from an ab_map entry with the same key
            const llama_lora_weight & orig = ab_map.at(it.first);
            const llama_lora_weight & dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

View File

@ -4,7 +4,7 @@
#include "llama-batch.h"
#include "llama-model.h"
#include "llama-kv-cache.h"
#include "llama-control-vector.h"
#include "llama-adapter.h"
#include "ggml-cpp.h"
@ -54,7 +54,7 @@ struct llama_context {
const struct llama_model & model;
struct llama_cparams cparams;
struct llama_sbatch sbatch;
struct llama_sbatch sbatch; // TODO: revisit if needed
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;

View File

@ -1 +0,0 @@
#include "llama-control-vector.h"

View File

@ -1,130 +0,0 @@
#pragma once
#include "llama-impl.h"
#include "ggml-cpp.h"
#include "llama-model.h" // TODO: need only hparams
#include <vector>
#include <map>
// Per-layer control vector: one direction tensor per layer, plus the inclusive
// layer range in which those tensors are currently active.
struct llama_control_vector {
    std::vector<struct ggml_tensor *> tensors; // indexed by layer

    // owners of the tensor metadata and of the backend memory behind it
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // inclusive active range; the default -1/-1 matches no layer
    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    // Return the direction tensor for layer `il`, or nullptr when `il` is
    // negative, outside [layer_start, layer_end], or past the end of `tensors`.
    struct ggml_tensor * tensor_for(int il) const {
        const bool active = il >= 0
                         && il >= layer_start
                         && il <= layer_end
                         && (size_t) il < tensors.size();

        return active ? tensors[il] : nullptr;
    }

    // Add the direction tensor of layer `il` to `cur` when one is active;
    // otherwise hand `cur` back unchanged.
    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
        ggml_tensor * dir = tensor_for(il);
        if (dir == nullptr) {
            return cur;
        }

        return ggml_add(ctx, cur, dir);
    }
};
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
struct ggml_init_params params = {
/*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
return nullptr;
}
ctx_map[buft] = ctx;
cvec.ctxs.emplace_back(ctx);
return ctx;
}
return it->second;
};
// make tensors
cvec.tensors.reserve(model.hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < model.hparams.n_layer; il++) {
ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
[&](ggml_context * ctx) {
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
return ggml_add(ctx, cur, layer_dir);
});
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
cvec.tensors.push_back(tensor);
}
// allocate tensors / buffers and zero
cvec.bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.bufs.emplace_back(buf);
}
return true;
}
static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
return 0;
}
if (n_embd != (int) model.hparams.n_embd) {
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
return 1;
}
if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
return 1;
}
}
cvec.layer_start = il_start;
cvec.layer_end = il_end;
for (size_t il = 1; il < model.hparams.n_layer; il++) {
assert(cvec.tensors[il] != nullptr);
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
}
}
return 0;
}

View File

@ -230,58 +230,10 @@ static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;
// The pair of tensors (A and B) that make up one LoRA weight.
// Both pointers are non-owning and default to nullptr.
struct llama_lora_weight {
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;

    // empty pair: both components unset
    llama_lora_weight() = default;

    // pair built from the given A and B tensors
    llama_lora_weight(struct ggml_tensor * lora_a, struct ggml_tensor * lora_b)
        : a(lora_a)
        , b(lora_b) {
    }
};
// A LoRA adapter bound to a base model.
//
// The adapter inserts itself into base_model->lora_adapters on construction and
// removes itself again on destruction.
struct llama_lora_adapter {
    struct llama_model * base_model;

    // map tensor name to lora_a_b
    std::unordered_map<std::string, struct llama_lora_weight> ab_map;

    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // LoRA alpha; zero until a loader assigns it
    float alpha = 0.0f;

    llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
        base_model->lora_adapters.insert(this);
    }

    // Return the A/B pair stored under the name of tensor `w`,
    // or nullptr when the adapter has no entry for that name.
    llama_lora_weight * get_weight(struct ggml_tensor * w) {
        std::string name(w->name);
        auto pos = ab_map.find(name); // single lookup, reused for the result
        if (pos != ab_map.end()) {
            return &pos->second;
        }
        return nullptr;
    }

    ~llama_lora_adapter() {
        auto pos = base_model->lora_adapters.find(this);
        if (pos != base_model->lora_adapters.end()) {
            base_model->lora_adapters.erase(pos);
        }
    }
};
// Number of entries in model.devices, narrowed to int.
static int llama_get_device_count(const llama_model & model) {
    const auto n_devices = model.devices.size();
    return static_cast<int>(n_devices);
}
// Return the first tensor in model->tensors_by_name whose name equals `name`,
// or nullptr when no entry matches.
static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
    for (const auto & entry : model->tensors_by_name) {
        if (entry.first == name) {
            return entry.second;
        }
    }

    return nullptr;
}
//
// model loading and saving
//
@ -15884,172 +15836,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
// Load a LoRA adapter from the GGUF file `path_lora` into `adapter`.
//
// Steps:
//   1. read the GGUF metadata (no tensor data) and check general.type, the
//      architecture against model->arch, and adapter.type; read the LoRA alpha
//   2. pair up the "<name>.lora_a" / "<name>.lora_b" tensors
//   3. for each pair, find the base-model tensor "<name>", validate the shapes and
//      duplicate both tensors into a context keyed by the buffer type of that
//      base-model tensor
//   4. allocate one backend buffer per buffer type
//   5. read the tensor data from the file and upload it
//
// Throws std::runtime_error when the file cannot be loaded, the metadata does not
// match, a tensor has an unexpected suffix, a pair is incomplete, the base tensor is
// missing, a shape is wrong, or a context/buffer cannot be allocated.
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    // takes ownership of the metadata context filled in by gguf_init_from_file
    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        // missing keys yield "" / 0.0f
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model->arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx); // the adapter now owns the context
            return buft_ctx;
        }
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_lora_weight & w = it.second;

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
        }

        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        if (!dev_ctx) {
            // ggml_init failed; without this check the null context would reach ggml_dup_tensor
            throw std::runtime_error("failed to allocate context for lora adapter");
        }

        // validate tensor shape
        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (w.a->ne[1] != w.b->ne[0]) {
            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
        }

        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf; // staging buffer reused for every tensor

        // copy the bytes of metadata tensor `orig` from the file into device tensor `dev`
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };

        for (auto & it : adapter.ab_map) {
            // every adapter entry was created from an ab_map entry with the same key
            const llama_lora_weight & orig = ab_map.at(it.first);
            const llama_lora_weight & dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
int32_t llama_lora_adapter_set(
struct llama_context * ctx,
struct llama_lora_adapter * adapter,
@ -16077,10 +15863,6 @@ void llama_lora_adapter_clear(struct llama_context * ctx) {
ctx->lora_adapters.clear();
}
// Destroy a LoRA adapter by invoking `delete` on it.
// The llama_lora_adapter destructor runs as part of this call.
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
delete adapter;
}
// TODO: tmp
int32_t llama_control_vector_apply(
struct llama_context * lctx,