mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-15 23:00:46 +01:00
commit
4fe0861a89
@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|||||||
|
|
||||||
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
|
||||||
|
&& src1->ne[1] == 1;
|
||||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||||
|
@ -2831,7 +2831,7 @@ struct llama_context {
|
|||||||
struct llama_lora_weight {
|
struct llama_lora_weight {
|
||||||
struct ggml_tensor * a = nullptr;
|
struct ggml_tensor * a = nullptr;
|
||||||
struct ggml_tensor * b = nullptr;
|
struct ggml_tensor * b = nullptr;
|
||||||
llama_lora_weight() {}
|
llama_lora_weight() = default;
|
||||||
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
|
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
|
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
|
||||||
static const int n_inp_tensors = 5; // see llama_model
|
|
||||||
static const int n_out_tensors = 5; // see llama_model
|
|
||||||
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);
|
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);
|
||||||
|
|
||||||
ggml_context * ctx = nullptr;
|
ggml_context * ctx = nullptr;
|
||||||
struct gguf_init_params meta_gguf_params = {
|
struct gguf_init_params meta_gguf_params = {
|
||||||
/* .no_alloc = */ false,
|
/* .no_alloc = */ true,
|
||||||
/* .ctx = */ &ctx,
|
/* .ctx = */ &ctx,
|
||||||
};
|
};
|
||||||
struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
|
struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
|
||||||
@ -18536,7 +18534,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
|
|||||||
// check metadata
|
// check metadata
|
||||||
{
|
{
|
||||||
auto get_kv_str = [&](std::string key) -> std::string {
|
auto get_kv_str = [&](std::string key) -> std::string {
|
||||||
std::vector<char> str_buf(32, 0); // we only get the arch, so no need big buffer here
|
|
||||||
int id = gguf_find_key(ctx_gguf, key.c_str());
|
int id = gguf_find_key(ctx_gguf, key.c_str());
|
||||||
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
|
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
|
||||||
};
|
};
|
||||||
@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
|
|||||||
auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
|
auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
|
||||||
auto lora_arch = llm_arch_from_string(lora_arch_name);
|
auto lora_arch = llm_arch_from_string(lora_arch_name);
|
||||||
if (lora_arch != model->arch) {
|
if (lora_arch != model->arch) {
|
||||||
|
gguf_free(ctx_gguf);
|
||||||
throw std::runtime_error("model arch and LoRA arch mismatch");
|
throw std::runtime_error("model arch and LoRA arch mismatch");
|
||||||
}
|
}
|
||||||
|
|
||||||
auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE));
|
auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE));
|
||||||
if (train_type != "finetune_lora") {
|
if (train_type != "finetune_lora") {
|
||||||
|
gguf_free(ctx_gguf);
|
||||||
throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type);
|
throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculate n_tensors_per_layer
|
int n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||||
int n_tensors_per_layer = 0;
|
|
||||||
{
|
|
||||||
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
|
|
||||||
for (int i = 0; i < n_tensors; i++) {
|
|
||||||
int il = -1;
|
|
||||||
sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il);
|
|
||||||
if (il == 0) n_tensors_per_layer++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// count layer buffer types
|
// contexts for each buffer type
|
||||||
std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
|
|
||||||
for (int64_t i = 0; i < model->hparams.n_layer; i++) {
|
|
||||||
buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer;
|
|
||||||
}
|
|
||||||
buft_tensor_count[model->buft_input.buft] += n_inp_tensors;
|
|
||||||
buft_tensor_count[model->buft_output.buft] += n_out_tensors;
|
|
||||||
|
|
||||||
// allocate contexts
|
|
||||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||||
{
|
auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
||||||
auto new_ggml_ctx = [](size_t n_tensors) {
|
auto it = ctx_map.find(buft);
|
||||||
|
if (it == ctx_map.end()) {
|
||||||
|
// add a new context
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
|
/*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
|
||||||
/*.mem_buffer =*/ NULL,
|
/*.mem_buffer =*/ NULL,
|
||||||
/*.no_alloc =*/ true,
|
/*.no_alloc =*/ true,
|
||||||
};
|
};
|
||||||
return ggml_init(params);
|
ggml_context * buft_ctx = ggml_init(params);
|
||||||
|
ctx_map[buft] = buft_ctx;
|
||||||
|
return buft_ctx;
|
||||||
};
|
};
|
||||||
for (auto & it : buft_tensor_count) {
|
return it->second;
|
||||||
int n_tensors = it.second;
|
};
|
||||||
// LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
|
|
||||||
ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// bundle lora_a and lora_b into pairs
|
// bundle lora_a and lora_b into pairs
|
||||||
std::map<std::string, llama_lora_weight> ab_map;
|
std::map<std::string, llama_lora_weight> ab_map;
|
||||||
@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
|
|||||||
ab_map[name].b = cur;
|
ab_map[name].b = cur;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// maybe "optimizer.*"" tensors
|
gguf_free(ctx_gguf);
|
||||||
LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
|
ggml_free(ctx);
|
||||||
|
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// add tensors
|
// add tensors
|
||||||
for (auto & it : ab_map) {
|
for (auto & it : ab_map) {
|
||||||
std::string name = it.first;
|
const std::string & name = it.first;
|
||||||
const char * cname = name.c_str();
|
|
||||||
llama_lora_weight & w = it.second;
|
llama_lora_weight & w = it.second;
|
||||||
GGML_ASSERT(w.a != nullptr);
|
|
||||||
GGML_ASSERT(w.b != nullptr);
|
if (!w.a || !w.b) {
|
||||||
int il = -1;
|
gguf_free(ctx_gguf);
|
||||||
sscanf(cname, "blk.%d.", &il);
|
ggml_free(ctx);
|
||||||
|
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
|
||||||
|
}
|
||||||
|
|
||||||
// device buft and device ctx
|
// device buft and device ctx
|
||||||
auto model_tensor = llama_get_model_tensor(model, cname);
|
auto * model_tensor = llama_get_model_tensor(model, name.c_str());
|
||||||
if (!model_tensor) {
|
if (!model_tensor) {
|
||||||
gguf_free(ctx_gguf);
|
gguf_free(ctx_gguf);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
|
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
|
||||||
}
|
}
|
||||||
struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer));
|
struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
|
||||||
// validate tensor shape
|
// validate tensor shape
|
||||||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
|
||||||
|
gguf_free(ctx_gguf);
|
||||||
|
ggml_free(ctx);
|
||||||
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
|
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
|
||||||
}
|
}
|
||||||
if (w.a->ne[1] != w.b->ne[0]) {
|
if (w.a->ne[1] != w.b->ne[0]) {
|
||||||
|
gguf_free(ctx_gguf);
|
||||||
|
ggml_free(ctx);
|
||||||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
|
||||||
}
|
}
|
||||||
// save tensor to adapter
|
// save tensor to adapter
|
||||||
@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
|
|||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
throw std::runtime_error("failed to allocate buffer for lora adapter\n");
|
throw std::runtime_error("failed to allocate buffer for lora adapter\n");
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_clear(buf, 0);
|
LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||||
adapter.ctxs.push_back(ctx_dev);
|
adapter.ctxs.push_back(ctx_dev);
|
||||||
adapter.bufs.push_back(buf);
|
adapter.bufs.push_back(buf);
|
||||||
}
|
}
|
||||||
@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
|
|||||||
auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
|
auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
|
||||||
size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
|
size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
|
||||||
size_t size = ggml_nbytes(orig);
|
size_t size = ggml_nbytes(orig);
|
||||||
if (read_buf.size() < size) {
|
read_buf.resize(size);
|
||||||
read_buf.resize(size);
|
|
||||||
}
|
|
||||||
gguf_file.seek(offs, SEEK_SET);
|
gguf_file.seek(offs, SEEK_SET);
|
||||||
gguf_file.read_raw(read_buf.data(), size);
|
gguf_file.read_raw(read_buf.data(), size);
|
||||||
// LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
|
|
||||||
ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
|
ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
|
||||||
};
|
};
|
||||||
for (auto & it : adapter.ab_map) {
|
for (auto & it : adapter.ab_map) {
|
||||||
|
Loading…
Reference in New Issue
Block a user