Merge pull request #9 from ggerganov/sl/fix_fix_lora

fix lora issues
2025-02-05 08:00:42 +01:00 · 2024-07-10 10:33:42 +02:00 · 2024-07-10 10:33:42 +02:00 · 4fe0861a89
commit 4fe0861a89
parent 713665db2e 9841fbda7c
2 changed files with 37 additions and 49 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor

    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
+        && src1->ne[1] == 1;
    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2831,7 +2831,7 @@ struct llama_context {
 struct llama_lora_weight {
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;
-    llama_lora_weight() {}
+    llama_lora_weight() = default;
    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
 };

@@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }

 static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
-    static const int n_inp_tensors = 5; // see llama_model
-    static const int n_out_tensors = 5; // see llama_model
    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
+        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
@@ -18536,7 +18534,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
    // check metadata
    {
        auto get_kv_str = [&](std::string key) -> std::string {
-            std::vector<char> str_buf(32, 0); // we only get the arch, so no need big buffer here
            int id = gguf_find_key(ctx_gguf, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
        };
@@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
        auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto lora_arch = llm_arch_from_string(lora_arch_name);
        if (lora_arch != model->arch) {
+            gguf_free(ctx_gguf);
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }
+
        auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE));
        if (train_type != "finetune_lora") {
+            gguf_free(ctx_gguf);
            throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type);
        }
    }

-    // calculate n_tensors_per_layer
-    int n_tensors_per_layer = 0;
-    {
-        int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            int il = -1;
-            sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il);
-            if (il == 0) n_tensors_per_layer++;
-        }
-    }
+    int n_tensors = gguf_get_n_tensors(ctx_gguf);

-    // count layer buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
-    for (int64_t i = 0; i < model->hparams.n_layer; i++) {
-        buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer;
-    }
-    buft_tensor_count[model->buft_input.buft]  += n_inp_tensors;
-    buft_tensor_count[model->buft_output.buft] += n_out_tensors;
-
-    // allocate contexts
+    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    {
-        auto new_ggml_ctx = [](size_t n_tensors) {
+    auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
-            return ggml_init(params);
+            ggml_context * buft_ctx = ggml_init(params);
+            ctx_map[buft] = buft_ctx;
+            return buft_ctx;
        };
-        for (auto & it : buft_tensor_count) {
-            int n_tensors = it.second;
-            // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
-            ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
-        }
-    }
+        return it->second;
+    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_lora_weight> ab_map;
@@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                ab_map[name].b = cur;
            }
        } else {
-            // maybe "optimizer.*"" tensors
-            LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // add tensors
    for (auto & it : ab_map) {
-        std::string name = it.first;
-        const char * cname = name.c_str();
+        const std::string & name = it.first;
        llama_lora_weight & w = it.second;
-        GGML_ASSERT(w.a != nullptr);
-        GGML_ASSERT(w.b != nullptr);
-        int il = -1;
-        sscanf(cname, "blk.%d.", &il);
+
+        if (!w.a || !w.b) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
        // device buft and device ctx
-        auto model_tensor = llama_get_model_tensor(model, cname);
+        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
        if (!model_tensor) {
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
        }
-        struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer));
+        struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (w.a->ne[1] != w.b->ne[0]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
        }
        // save tensor to adapter
@@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                ggml_free(ctx);
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
-            ggml_backend_buffer_clear(buf, 0);
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
            adapter.ctxs.push_back(ctx_dev);
            adapter.bufs.push_back(buf);
        }
@@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
            size_t size = ggml_nbytes(orig);
-            if (read_buf.size() < size) {
-                read_buf.resize(size);
-            }
+            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
-            // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {