From f15167a4c7532101aa61e3e093a92801bf0d3ead Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 10 Jul 2024 02:21:38 +0200
Subject: [PATCH 1/2] cuda : do not use dmmv if the tensor does not have enough cols

---
 ggml/src/ggml-cuda.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 1c9ccc8a1..dfd75e0e7 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
+        && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
 
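
The guard added above makes the dequantize_mul_mat_vec (dmmv) path require at
least two GGML_CUDA_DMMV_X-wide column blocks in src0; the dmmv kernel's inner
loop strides over 2*GGML_CUDA_DMMV_X columns per iteration, so shorter rows
would never complete a full iteration. A minimal standalone sketch of the
predicate, assuming the default GGML_CUDA_DMMV_X of 32 (it is a build option)
and using a hypothetical helper name (can_use_dmmv is not part of ggml):

    #include <cstdint>

    // Assumed default of the GGML_CUDA_DMMV_X build option.
    static const int64_t DMMV_X = 32;

    // Hypothetical helper mirroring the condition in ggml_cuda_mul_mat:
    // src0's rows must tile evenly into DMMV_X-wide blocks, provide at
    // least two such blocks, and src1 must be a single column vector.
    static bool can_use_dmmv(int64_t src0_ne0, int64_t src1_ne1) {
        return src0_ne0 % DMMV_X == 0
            && src0_ne0 >= DMMV_X*2
            && src1_ne1 == 1;
    }
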
"" : std::string(gguf_get_val_str(ctx_gguf, id)); }; @@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch = llm_arch_from_string(lora_arch_name); if (lora_arch != model->arch) { + gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } + auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); if (train_type != "finetune_lora") { + gguf_free(ctx_gguf); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); } } - // calculate n_tensors_per_layer - int n_tensors_per_layer = 0; - { - int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - int il = -1; - sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); - if (il == 0) n_tensors_per_layer++; - } - } + int n_tensors = gguf_get_n_tensors(ctx_gguf); - // count layer buffer types - std::map buft_tensor_count; - for (int64_t i = 0; i < model->hparams.n_layer; i++) { - buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; - } - buft_tensor_count[model->buft_input.buft] += n_inp_tensors; - buft_tensor_count[model->buft_output.buft] += n_out_tensors; - - // allocate contexts + // contexts for each buffer type std::map ctx_map; - { - auto new_ggml_ctx = [](size_t n_tensors) { + auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context struct ggml_init_params params = { /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - return ggml_init(params); + ggml_context * buft_ctx = ggml_init(params); + ctx_map[buft] = buft_ctx; + return buft_ctx; }; - for (auto & it : buft_tensor_count) { - int n_tensors = it.second; - // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors - } - } + return it->second; + }; // bundle lora_a and lora_b into pairs std::map ab_map; @@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ab_map[name].b = cur; } } else { - // maybe "optimizer.*"" tensors - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } // add tensors for (auto & it : ab_map) { - std::string name = it.first; - const char * cname = name.c_str(); + const std::string & name = it.first; llama_lora_weight & w = it.second; - GGML_ASSERT(w.a != nullptr); - GGML_ASSERT(w.b != nullptr); - int il = -1; - sscanf(cname, "blk.%d.", &il); + + if (!w.a || !w.b) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + // device buft and device ctx - auto model_tensor = llama_get_model_tensor(model, cname); + auto * model_tensor = llama_get_model_tensor(model, name.c_str()); if (!model_tensor) { gguf_free(ctx_gguf); ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } - struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + 
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
         if (w.a->ne[1] != w.b->ne[0]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
         }
         // save tensor to adapter
@@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
             ggml_free(ctx);
             throw std::runtime_error("failed to allocate buffer for lora adapter\n");
         }
-        ggml_backend_buffer_clear(buf, 0);
+        LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         adapter.ctxs.push_back(ctx_dev);
         adapter.bufs.push_back(buf);
     }
@@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
             size_t size = ggml_nbytes(orig);
-            if (read_buf.size() < size) {
-                read_buf.resize(size);
-            }
+            read_buf.resize(size);
             gguf_file.seek(offs, SEEK_SET);
             gguf_file.read_raw(read_buf.data(), size);
-            // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
             ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
         };
         for (auto & it : adapter.ab_map) {
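
For reference, the shape contract enforced by the new validation, as a minimal
self-contained sketch (shape2d and check_lora_pair are illustrative names, not
llama.cpp API). ggml stores dimensions in ne[], ne[0] first, so for a base
weight of shape ne0 x ne1 and rank r, lora_a is expected as ne0 x r and lora_b
as r x ne1, which is exactly what the two throws above check:

    #include <cstdint>
    #include <stdexcept>
    #include <string>

    // Illustrative stand-in for the two leading dimensions of a ggml_tensor.
    struct shape2d { int64_t ne0, ne1; };

    // Mirrors the two checks in the patch: lora_a must share ne0 with the
    // base tensor, lora_b must share ne1, and the inner (rank) dimensions
    // of the pair must agree.
    static void check_lora_pair(const std::string & name, const shape2d & w,
                                const shape2d & a, const shape2d & b) {
        if (w.ne0 != a.ne0 || w.ne1 != b.ne1) {
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (a.ne1 != b.ne0) {
            throw std::runtime_error("lora_a tensor is not transposed");
        }
    }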