gguf : add todos and comments

M. Yusuf Sarıgöz 2023-08-14 08:50:47 +03:00
parent 24f48833ab
commit 196b50fee7


@@ -258,6 +258,11 @@ struct llama_kv_cache {
};
struct llama_vocab {
// TODO: convert to this gguf_vocab
// add a vector of merges
// add members for bos/eos/pad/sep tokens
// so that we can pass it to different types of tokenizers with a common interface
using id = int32_t;
using token = std::string;
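The TODO above sketches a richer vocab type. A rough illustration of what such a gguf_vocab could look like — the struct name and every member below are extrapolated from the TODO, not taken from the patch:

    // Hypothetical sketch only: BPE merges plus explicit special-token ids,
    // so different tokenizer implementations can share one interface.
    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct gguf_vocab {
        using id    = int32_t;
        using token = std::string;

        std::unordered_map<token, id> token_to_id;
        std::vector<token>            id_to_token;
        std::vector<std::string>      merges;   // BPE merge rules, highest priority first

        id bos_id = -1;   // beginning-of-sequence token
        id eos_id = -1;   // end-of-sequence token
        id pad_id = -1;   // padding token
        id sep_id = -1;   // separator token
    };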
@@ -447,7 +452,7 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
return size / ggml_blck_size(type);
}
struct llama_load_tensor {
struct gguf_load_tensor {
std::string name;
enum ggml_type type = GGML_TYPE_F32;
std::vector<uint32_t> ne;
@@ -457,9 +462,9 @@ struct llama_load_tensor {
uint8_t * data;
};
struct llama_load_tensors_map {
struct gguf_load_tensors_map {
// tensors is kept in a separate vector to preserve file order
std::vector<llama_load_tensor> tensors;
std::vector<gguf_load_tensor> tensors;
std::unordered_map<std::string, size_t> name_to_idx;
};
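As the comment notes, the vector preserves the on-disk tensor order while name_to_idx gives constant-time lookup by name. A small helper — illustrative only, not part of the patch — shows how the two containers stay in sync:

    // Illustrative helper, not in the patch: append in file order and index by name.
    static void register_tensor(gguf_load_tensors_map & map, gguf_load_tensor tensor) {
        map.name_to_idx[tensor.name] = map.tensors.size();
        map.tensors.emplace_back(std::move(tensor));
    }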
@@ -477,7 +482,7 @@ struct gguf_file_loader {
llama_vocab vocab;
struct ggml_context * ctx_data = NULL;
gguf_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
: file(fname, "rb") {
fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
@@ -523,13 +528,9 @@ struct ggml_context * ctx_data = NULL;
void read_hparams() {
// TODO make keysconstants in header
// TODO define keys as constants in header
// TODO: read all hparams from file
int q_ver_idx = gguf_find_key (gguf_ctx, "general.quantization_version");
if (q_ver_idx != -1) {
hparams.ftype = (enum llama_ftype) gguf_get_val_u32(gguf_ctx, q_ver_idx);
}
hparams.n_vocab = read_n_vocab();
hparams.n_ctx = read_u32("llama.context_length");
hparams.n_embd = read_u32("llama.embedding_length");
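The read_u32 helper used for the llama.* keys is not shown in this hunk. A plausible implementation on top of the public gguf API (gguf_find_key and gguf_get_val_u32 are real functions; the helper itself is only a sketch) might be:

    #include <cstdint>
    #include <stdexcept>
    #include <string>
    // gguf_* declarations lived in ggml.h at the time of this commit

    static uint32_t read_u32_from(const struct gguf_context * ctx, const char * key) {
        const int idx = gguf_find_key(ctx, key);
        if (idx == -1) {
            throw std::runtime_error(std::string("missing required GGUF key: ") + key);
        }
        return gguf_get_val_u32(ctx, idx);
    }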
@@ -567,11 +568,11 @@ struct ggml_context * ctx_data = NULL;
}
}
void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
void read_tensor_metadata(gguf_load_tensors_map & tensors_map) {
const int n_tensors = gguf_get_n_tensors(gguf_ctx);
for (int i = 0; i < n_tensors; ++i) {
llama_load_tensor tensor;
gguf_load_tensor tensor;
const char * name = gguf_get_tensor_name(gguf_ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
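The hunk cuts off just after the ggml tensor is fetched from ctx_data. The rest of the loop presumably copies the metadata into a gguf_load_tensor and registers it in the map; a continuation sketch, not verbatim from the file (gguf_get_data_offset / gguf_get_tensor_offset are real gguf calls, the field names follow the struct shown earlier, and ggml_tensor still carried an n_dims field at the time of this commit):

    // Continuation sketch only:
    gguf_load_tensor tensor;
    tensor.name = name;
    tensor.type = cur->type;
    for (int d = 0; d < cur->n_dims; ++d) {
        tensor.ne.push_back((uint32_t) cur->ne[d]);
    }
    tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);

    tensors_map.name_to_idx[tensor.name] = tensors_map.tensors.size();
    tensors_map.tensors.push_back(std::move(tensor));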
@@ -617,6 +618,13 @@ struct ggml_context * ctx_data = NULL;
};
struct gguf_file_saver {
// TODO
// this implementation now assumes that the data section is of the same length as the unquantized model.
// this is needed to write tensor metadata and weights in a single pass by seeking to appropriate positions in the file.
// this may not be true when we add quantization version and change ftype description (currently it's string according to the specs,
// but better to have it as uint32).
// we need to calculate the delta in number of bytes written with a counter as a struct member.
gguf_file file;
gguf_file_loader * fl;
size_t info_offset;
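The TODO proposes tracking how far the rewritten metadata drifts from the original layout. One hypothetical shape for that counter — none of this is in the patch:

    // Hypothetical: record the difference between bytes written and bytes present
    // in the original file, then shift every tensor offset by that delta.
    struct written_delta_tracker {
        int64_t delta = 0;

        void account(size_t bytes_written, size_t bytes_in_original) {
            delta += (int64_t) bytes_written - (int64_t) bytes_in_original;
        }

        uint64_t adjust(uint64_t original_offset) const {
            return (uint64_t) ((int64_t) original_offset + delta);
        }
    };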
@@ -747,7 +755,7 @@ struct gguf_file_saver {
GGML_ASSERT(info_offset == file.tell());
}
size_t write_tensor_info(llama_load_tensor & tensor, enum ggml_type type) {
size_t write_tensor_info(gguf_load_tensor & tensor, enum ggml_type type) {
size_t total_written = 0;
file.seek(info_offset, SEEK_SET);
GGML_ASSERT(info_offset == file.tell());
@@ -761,14 +769,14 @@ struct gguf_file_saver {
total_written += file.write_i32(type);
total_written += file.write_u64(tensor_offset);
info_offset += total_written;
info_offset += total_written; // position to write info of the next tensor
file.seek(0, SEEK_END);
return total_written;
}
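write_tensor_info above and write_tensor just below implement a two-cursor, single-pass layout: one file position for the tensor-info section, one at the end of the file for the data. A minimal plain-stdio demo of the same pattern — a standalone sketch, not the gguf_file wrapper used by the patch:

    #include <cstdio>

    int main() {
        FILE * f = fopen("demo.bin", "wb+");
        if (!f) return 1;

        unsigned char zeros[64] = {0};
        fwrite(zeros, 1, sizeof(zeros), f);   // reserve a fixed-size metadata region
        long info_offset = 0;                 // cursor for the next metadata record

        // append one payload blob at the end, then record its offset up front
        fseek(f, 0, SEEK_END);
        long payload_offset = ftell(f);
        const char payload[] = "tensor bytes would go here";
        fwrite(payload, 1, sizeof(payload), f);

        fseek(f, info_offset, SEEK_SET);
        fwrite(&payload_offset, sizeof(payload_offset), 1, f);
        info_offset += (long) sizeof(payload_offset);

        fseek(f, 0, SEEK_END);                // back to the end, ready for the next tensor
        fclose(f);
        return 0;
    }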
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
void write_tensor(gguf_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
switch (new_type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -791,13 +799,13 @@ struct gguf_file_saver {
size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment
size_t pad = padded_size - new_size;
file.write_zeros(pad);
tensor_offset += padded_size;
tensor_offset += padded_size; // offset of the next tensor
}
};
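The padding above keeps every tensor's data 32-byte aligned (GGUF_DEFAULT_ALIGNMENT). A tiny standalone example of the same rounding GGML_PAD performs, with made-up sizes:

    #include <cstdio>

    // Same rounding as ggml's GGML_PAD macro; align must be a power of two.
    static size_t pad_to(size_t size, size_t align) {
        return (size + align - 1) & ~(align - 1);
    }

    int main() {
        const size_t new_size    = 70;                   // example byte count, made up
        const size_t padded_size = pad_to(new_size, 32); // GGUF_DEFAULT_ALIGNMENT is 32
        printf("padded %zu -> %zu, %zu zero bytes of padding\n",
               new_size, padded_size, padded_size - new_size);
        return 0;                                        // padded 70 -> 96, 26 zero bytes
    }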
struct llama_model_loader {
std::unique_ptr<gguf_file_loader> file_loader;
llama_load_tensors_map tensors_map;
gguf_load_tensors_map tensors_map;
bool use_mmap;
size_t num_ggml_tensors_created = 0;
struct ggml_context * ggml_ctx = NULL;
@@ -813,7 +821,7 @@ struct llama_model_loader {
void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
*ctx_size_p = *mmapped_size_p = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) {
for (const gguf_load_tensor & lt : tensors_map.tensors) {
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
}
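The accounting above charges each tensor's struct overhead to the ggml context and its payload either to the context or to the mmapped total. A small standalone rehearsal with illustrative numbers — the sizes below are assumptions, not measurements:

    #include <cstdio>

    int main() {
        const size_t n_tensors    = 291;          // illustrative tensor count
        const size_t struct_bytes = 384;          // assumed ggml_tensor + object overhead
        const size_t weight_bytes = 7ull << 30;   // assumed total tensor data
        const bool   use_mmap     = true;

        size_t ctx_size     = n_tensors * struct_bytes;
        size_t mmapped_size = 0;
        (use_mmap ? mmapped_size : ctx_size) += weight_bytes + n_tensors * 16;

        printf("ctx: %zu bytes, mmapped: %zu bytes\n", ctx_size, mmapped_size);
        return 0;
    }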
@@ -824,7 +832,7 @@ struct llama_model_loader {
if (it == tensors_map.name_to_idx.end()) {
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
}
llama_load_tensor & lt = tensors_map.tensors.at(it->second);
gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
if (lt.ne != ne) {
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
@@ -833,7 +841,7 @@ struct llama_model_loader {
return get_tensor_for(lt, backend);
}
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
if (backend != GGML_BACKEND_CPU) {
ggml_set_no_alloc(ggml_ctx, true);
@@ -866,7 +874,7 @@ struct llama_model_loader {
size_t data_size = 0;
size_t prefetch_size = 0;
size_t lock_size = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) {
for (const gguf_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
prefetch_size += lt.size;
@@ -881,7 +889,7 @@ struct llama_model_loader {
}
size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) {
for (gguf_load_tensor & lt : tensors_map.tensors) {
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
@@ -928,7 +936,7 @@ struct llama_model_loader {
}
}
void load_data_for(llama_load_tensor & lt) {
void load_data_for(gguf_load_tensor & lt) {
if (use_mmap) {
lt.data = (uint8_t *) mapping->addr + lt.file_off;
} else {
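The hunk ends right where the non-mmap branch begins. Conceptually that branch has to position the file at lt.file_off and read lt.size bytes into lt.data; a plain-stdio stand-in for that step (the real code goes through the gguf_file wrapper, which is not shown here):

    #include <cstdint>
    #include <cstdio>

    static bool read_tensor_bytes(FILE * f, long file_off, uint8_t * dst, size_t size) {
        if (fseek(f, file_off, SEEK_SET) != 0) {
            return false;
        }
        return fread(dst, 1, size, f) == size;
    }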
@@ -942,7 +950,7 @@ struct llama_model_loader {
}
}
static void print_checksum(llama_load_tensor & lt) {
static void print_checksum(gguf_load_tensor & lt) {
uint32_t sum = 0;
for (size_t i = 0; i < lt.size; i++) {
uint8_t byte = lt.data[i];
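print_checksum is also cut off mid-loop. A standalone stand-in for the same idea — fold every byte into a 32-bit value and print it; the exact mixing step and output format upstream are not visible in this hunk:

    #include <cstdint>
    #include <cstdio>

    static void print_checksum_sketch(const char * name, const uint8_t * data, size_t size) {
        uint32_t sum = 0;
        for (size_t i = 0; i < size; i++) {
            sum += data[i];   // simple byte sum; upstream may mix more aggressively
        }
        fprintf(stderr, "%s: checksum %#08x (%zu bytes)\n", name, sum, size);
    }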
@@ -1421,7 +1429,7 @@ static void llama_model_load_internal(
}
// populate `tensors_by_name`
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
for (gguf_load_tensor & lt : ml->tensors_map.tensors) {
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}
@@ -2896,7 +2904,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
// quantization
//
static void llama_convert_tensor_internal(const llama_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
if (output.size < nelements * sizeof(float)) {
output.resize(nelements * sizeof(float));
}
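The resize above guarantees the f32 staging buffer can hold the whole tensor before dequantization. In numbers, for an example 4096 x 4096 weight matrix (illustrative figures, not from the patch):

    #include <cstdio>

    int main() {
        const long long nelements = 4096LL * 4096LL;                  // 16,777,216 values
        const long long f32_bytes = nelements * (long long) sizeof(float);
        printf("staging buffer: %lld floats = %lld bytes (%.0f MiB)\n",
               nelements, f32_bytes, f32_bytes / (1024.0 * 1024.0));  // 64 MiB
        return 0;
    }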
@@ -3018,7 +3026,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
};
size_t idx = 0;
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
gguf_buffer read_data;
read_data.resize(tensor.size);
tensor.data = read_data.addr;
@@ -3570,7 +3578,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
return 1;
}
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
gguf_load_tensor & lt = model_loader->tensors_map.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt);
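For reference, a slightly more defensive variant of the lookup above — the patch indexes name_to_idx directly, which default-inserts on a missing key; this fragment is illustrative only:

    auto it = model_loader->tensors_map.name_to_idx.find(base_name);
    if (it == model_loader->tensors_map.name_to_idx.end()) {
        fprintf(stderr, "%s: tensor '%s' not found in base model\n", __func__, base_name.c_str());
        return 1;
    }
    gguf_load_tensor & lt = model_loader->tensors_map.tensors[it->second];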