From f15167a4c7532101aa61e3e093a92801bf0d3ead Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 10 Jul 2024 02:21:38 +0200
Subject: [PATCH 1/2] cuda : do not use dmmv if the tensor does not have enough cols

---
 ggml/src/ggml-cuda.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 1c9ccc8a1..dfd75e0e7 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
+        && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
 
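
The guard added above makes the dequantize_mul_mat_vec (dmmv) path require at
least two GGML_CUDA_DMMV_X-wide column blocks in src0; the dmmv kernel's inner
loop strides over 2*GGML_CUDA_DMMV_X columns per iteration, so shorter rows
would never complete a full iteration. A minimal standalone sketch of the
predicate, assuming the default GGML_CUDA_DMMV_X of 32 (it is a build option)
and using a hypothetical helper name (can_use_dmmv is not part of ggml):

    #include <cstdint>

    // Assumed default of the GGML_CUDA_DMMV_X build option.
    static const int64_t DMMV_X = 32;

    // Hypothetical helper mirroring the condition in ggml_cuda_mul_mat:
    // src0's rows must tile evenly into DMMV_X-wide blocks, provide at
    // least two such blocks, and src1 must be a single column vector.
    static bool can_use_dmmv(int64_t src0_ne0, int64_t src1_ne1) {
        return src0_ne0 % DMMV_X == 0
            && src0_ne0 >= DMMV_X*2
            && src1_ne1 == 1;
    }
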
"" : std::string(gguf_get_val_str(ctx_gguf, id)); }; @@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch = llm_arch_from_string(lora_arch_name); if (lora_arch != model->arch) { + gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } + auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); if (train_type != "finetune_lora") { + gguf_free(ctx_gguf); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); } } - // calculate n_tensors_per_layer - int n_tensors_per_layer = 0; - { - int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - int il = -1; - sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); - if (il == 0) n_tensors_per_layer++; - } - } + int n_tensors = gguf_get_n_tensors(ctx_gguf); - // count layer buffer types - std::map buft_tensor_count; - for (int64_t i = 0; i < model->hparams.n_layer; i++) { - buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; - } - buft_tensor_count[model->buft_input.buft] += n_inp_tensors; - buft_tensor_count[model->buft_output.buft] += n_out_tensors; - - // allocate contexts + // contexts for each buffer type std::map ctx_map; - { - auto new_ggml_ctx = [](size_t n_tensors) { + auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context struct ggml_init_params params = { /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - return ggml_init(params); + ggml_context * buft_ctx = ggml_init(params); + ctx_map[buft] = buft_ctx; + return buft_ctx; }; - for (auto & it : buft_tensor_count) { - int n_tensors = it.second; - // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors - } - } + return it->second; + }; // bundle lora_a and lora_b into pairs std::map ab_map; @@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ab_map[name].b = cur; } } else { - // maybe "optimizer.*"" tensors - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } // add tensors for (auto & it : ab_map) { - std::string name = it.first; - const char * cname = name.c_str(); + const std::string & name = it.first; llama_lora_weight & w = it.second; - GGML_ASSERT(w.a != nullptr); - GGML_ASSERT(w.b != nullptr); - int il = -1; - sscanf(cname, "blk.%d.", &il); + + if (!w.a || !w.b) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + // device buft and device ctx - auto model_tensor = llama_get_model_tensor(model, cname); + auto * model_tensor = llama_get_model_tensor(model, name.c_str()); if (!model_tensor) { gguf_free(ctx_gguf); ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } - struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + 
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
         if (w.a->ne[1] != w.b->ne[0]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
         }
         // save tensor to adapter
@@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
             ggml_free(ctx);
             throw std::runtime_error("failed to allocate buffer for lora adapter\n");
         }
-        ggml_backend_buffer_clear(buf, 0);
+        LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         adapter.ctxs.push_back(ctx_dev);
         adapter.bufs.push_back(buf);
     }
@@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
             size_t size = ggml_nbytes(orig);
-            if (read_buf.size() < size) {
-                read_buf.resize(size);
-            }
+            read_buf.resize(size);
             gguf_file.seek(offs, SEEK_SET);
             gguf_file.read_raw(read_buf.data(), size);
-            // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
             ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
         };
         for (auto & it : adapter.ab_map) {
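
For reference, the shape contract enforced by the new validation, as a minimal
self-contained sketch (shape2d and check_lora_pair are illustrative names, not
llama.cpp API). ggml stores dimensions in ne[], ne[0] first, so for a base
weight of shape ne0 x ne1 and rank r, lora_a is expected as ne0 x r and lora_b
as r x ne1, which is exactly what the two throws above check:

    #include <cstdint>
    #include <stdexcept>
    #include <string>

    // Illustrative stand-in for the two leading dimensions of a ggml_tensor.
    struct shape2d { int64_t ne0, ne1; };

    // Mirrors the two checks in the patch: lora_a must share ne0 with the
    // base tensor, lora_b must share ne1, and the inner (rank) dimensions
    // of the pair must agree.
    static void check_lora_pair(const std::string & name, const shape2d & w,
                                const shape2d & a, const shape2d & b) {
        if (w.ne0 != a.ne0 || w.ne1 != b.ne1) {
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (a.ne1 != b.ne0) {
            throw std::runtime_error("lora_a tensor is not transposed");
        }
    }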