From fc4c2a6fc3569c175028a06ea0f618da1d998325 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 26 Mar 2024 11:53:42 +0200
Subject: [PATCH] quantize: be able to override metadata by key

---
 examples/quantize/quantize.cpp | 58 +++++++++++++++++++++++++++++++++-
 llama.cpp                      | 26 ++++++++++++++-
 llama.h                        |  1 +
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 79e60ea7b..1826dd281 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -87,13 +87,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
@@ -201,6 +205,48 @@ static ggml_type parse_ggml_type(const char * arg) {
     return result;
 }
 
+static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.int_value = std::atol(sep);
+    }
+    else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.float_value = std::atof(sep);
+    }
+    else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.bool_value = true;
+        }
+        else if (std::strcmp(sep, "false") == 0) {
+            kvo.bool_value = false;
+        }
+        else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    }
+    else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -211,6 +257,7 @@ int main(int argc, char ** argv) {
     int arg_idx = 1;
     std::string imatrix_file;
     std::vector<std::string> included_weights, excluded_weights;
+    std::vector<llama_model_kv_override> kv_overrides;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -227,6 +274,10 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
+            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -267,6 +318,11 @@ int main(int argc, char ** argv) {
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
     }
+    if (!kv_overrides.empty()) {
+        kv_overrides.emplace_back();
+        kv_overrides.back().key[0] = 0;
+        params.kv_overrides = &kv_overrides;
+    }
 
     llama_backend_init();
 
diff --git a/llama.cpp b/llama.cpp
index 384021efd..b68ee2973 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12776,7 +12776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap, NULL);
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override> *)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
     ml.init_mappings(false); // no prefetching?
 
     llama_model model;
@@ -12804,6 +12809,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            }
+            else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            }
+            else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            }
+            else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
 
     for (int i = 0; i < ml.n_tensors; ++i) {
         const struct ggml_tensor * meta = ml.get_tensor_meta(i);
@@ -13363,6 +13386,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy    =*/ false,
         /*.pure         =*/ false,
         /*.imatrix      =*/ nullptr,
+        /*.kv_overrides =*/ nullptr,
     };
 
     return result;
diff --git a/llama.h b/llama.h
index 74f0e56de..6daf3a357 100644
--- a/llama.h
+++ b/llama.h
@@ -284,6 +284,7 @@ extern "C" {
         bool only_copy;      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;           // quantize all tensors to the default type
         void * imatrix;      // pointer to importance matrix data
+        void * kv_overrides; // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
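
Note (not part of the patch): below is a minimal caller-side sketch of how the new kv_overrides field of llama_model_quantize_params is intended to be filled when using the libllama API directly, mirroring what main() in quantize.cpp does after parsing --override-kv. The key "some.metadata.key", the value 42, the file names, and the chosen ftype are illustrative assumptions, not taken from the patch.

// Sketch only: populate kv_overrides with a sentinel-terminated vector and quantize.
#include <cstdio>
#include <cstring>
#include <vector>
#include "llama.h"

int main() {
    std::vector<llama_model_kv_override> overrides;

    llama_model_kv_override kvo;
    std::strcpy(kvo.key, "some.metadata.key");   // hypothetical key, for illustration only
    kvo.tag       = LLAMA_KV_OVERRIDE_TYPE_INT;
    kvo.int_value = 42;
    overrides.push_back(kvo);

    // Terminate the list with an empty-key sentinel, as main() in quantize.cpp does;
    // both the model loader and the gguf_set_val_* loop stop at key[0] == 0.
    overrides.emplace_back();
    overrides.back().key[0] = 0;

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.kv_overrides = &overrides;            // stored as void *, cast back internally

    llama_backend_init();
    if (llama_model_quantize("model-f32.gguf", "model-quant.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
    }
    llama_backend_free();
    return 0;
}

On the quantize command line, the same override would be written in the KEY=TYPE:VALUE form accepted by parse_kv_override, e.g. --override-kv some.metadata.key=int:42.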