gguf : add todos and comments

This commit is contained in:
M. Yusuf Sarıgöz 2023-08-14 08:50:47 +03:00
parent 24f48833ab
commit 196b50fee7

View File

@ -258,6 +258,11 @@ struct llama_kv_cache {
}; };
struct llama_vocab { struct llama_vocab {
// TODO: convert this to gguf_vocab:
//   - add a vector of merges
//   - add members for the bos/eos/pad/sep tokens
// so that we can pass it to different types of tokenizers through a common interface
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
@ -447,7 +452,7 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
return size / ggml_blck_size(type); return size / ggml_blck_size(type);
} }
struct llama_load_tensor { struct gguf_load_tensor {
std::string name; std::string name;
enum ggml_type type = GGML_TYPE_F32; enum ggml_type type = GGML_TYPE_F32;
std::vector<uint32_t> ne; std::vector<uint32_t> ne;
@ -457,9 +462,9 @@ struct llama_load_tensor {
uint8_t * data; uint8_t * data;
}; };
struct llama_load_tensors_map { struct gguf_load_tensors_map {
// tensors is kept in a separate vector to preserve file order // tensors is kept in a separate vector to preserve file order
std::vector<llama_load_tensor> tensors; std::vector<gguf_load_tensor> tensors;
std::unordered_map<std::string, size_t> name_to_idx; std::unordered_map<std::string, size_t> name_to_idx;
}; };
@ -477,7 +482,7 @@ struct gguf_file_loader {
llama_vocab vocab; llama_vocab vocab;
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
gguf_file_loader(const char * fname, llama_load_tensors_map & tensors_map) gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
: file(fname, "rb") { : file(fname, "rb") {
fprintf(stderr, "llama.cpp: loading model from %s\n", fname); fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
@ -523,13 +528,9 @@ struct ggml_context * ctx_data = NULL;
void read_hparams() { void read_hparams() {
// TODO make keysconstants in header // TODO define keys as constants in header
// TODO: read all hparams from file // TODO: read all hparams from file
int q_ver_idx = gguf_find_key (gguf_ctx, "general.quantization_version");
if (q_ver_idx != -1) {
hparams.ftype = (enum llama_ftype) gguf_get_val_u32(gguf_ctx, q_ver_idx);
}
hparams.n_vocab = read_n_vocab(); hparams.n_vocab = read_n_vocab();
hparams.n_ctx = read_u32("llama.context_length"); hparams.n_ctx = read_u32("llama.context_length");
hparams.n_embd = read_u32("llama.embedding_length"); hparams.n_embd = read_u32("llama.embedding_length");
@ -567,11 +568,11 @@ struct ggml_context * ctx_data = NULL;
} }
} }
void read_tensor_metadata(llama_load_tensors_map & tensors_map) { void read_tensor_metadata(gguf_load_tensors_map & tensors_map) {
const int n_tensors = gguf_get_n_tensors(gguf_ctx); const int n_tensors = gguf_get_n_tensors(gguf_ctx);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
llama_load_tensor tensor; gguf_load_tensor tensor;
const char * name = gguf_get_tensor_name(gguf_ctx, i); const char * name = gguf_get_tensor_name(gguf_ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
@ -617,6 +618,13 @@ struct ggml_context * ctx_data = NULL;
}; };
struct gguf_file_saver { struct gguf_file_saver {
// TODO:
// This implementation currently assumes that the data section is the same length as in the unquantized model.
// That assumption is what lets us write tensor metadata and weights in a single pass by seeking to the appropriate positions in the file.
// It may no longer hold once we add the quantization version and change the ftype description (currently a string according to the spec,
// but it would be better stored as a uint32).
// To handle that, we need to track the delta in the number of bytes written with a counter kept as a struct member.
gguf_file file; gguf_file file;
gguf_file_loader * fl; gguf_file_loader * fl;
size_t info_offset; size_t info_offset;
@ -747,7 +755,7 @@ struct gguf_file_saver {
GGML_ASSERT(info_offset == file.tell()); GGML_ASSERT(info_offset == file.tell());
} }
size_t write_tensor_info(llama_load_tensor & tensor, enum ggml_type type) { size_t write_tensor_info(gguf_load_tensor & tensor, enum ggml_type type) {
size_t total_written = 0; size_t total_written = 0;
file.seek(info_offset, SEEK_SET); file.seek(info_offset, SEEK_SET);
GGML_ASSERT(info_offset == file.tell()); GGML_ASSERT(info_offset == file.tell());
@ -761,14 +769,14 @@ struct gguf_file_saver {
total_written += file.write_i32(type); total_written += file.write_i32(type);
total_written += file.write_u64(tensor_offset); total_written += file.write_u64(tensor_offset);
info_offset += total_written; info_offset += total_written; // position to write info of the next tensor
file.seek(0, SEEK_END); file.seek(0, SEEK_END);
return total_written; return total_written;
} }
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { void write_tensor(gguf_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
switch (new_type) { switch (new_type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
case GGML_TYPE_F16: case GGML_TYPE_F16:
@ -791,13 +799,13 @@ struct gguf_file_saver {
size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment
size_t pad = padded_size - new_size; size_t pad = padded_size - new_size;
file.write_zeros(pad); file.write_zeros(pad);
tensor_offset += padded_size; tensor_offset += padded_size; // offset of the next tensor
} }
}; };
struct llama_model_loader { struct llama_model_loader {
std::unique_ptr<gguf_file_loader> file_loader; std::unique_ptr<gguf_file_loader> file_loader;
llama_load_tensors_map tensors_map; gguf_load_tensors_map tensors_map;
bool use_mmap; bool use_mmap;
size_t num_ggml_tensors_created = 0; size_t num_ggml_tensors_created = 0;
struct ggml_context * ggml_ctx = NULL; struct ggml_context * ggml_ctx = NULL;
@ -813,7 +821,7 @@ struct llama_model_loader {
void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
*ctx_size_p = *mmapped_size_p = 0; *ctx_size_p = *mmapped_size_p = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) { for (const gguf_load_tensor & lt : tensors_map.tensors) {
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16; *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
} }
@ -824,7 +832,7 @@ struct llama_model_loader {
if (it == tensors_map.name_to_idx.end()) { if (it == tensors_map.name_to_idx.end()) {
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str()))); throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
} }
llama_load_tensor & lt = tensors_map.tensors.at(it->second); gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
if (lt.ne != ne) { if (lt.ne != ne) {
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str())); name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
@ -833,7 +841,7 @@ struct llama_model_loader {
return get_tensor_for(lt, backend); return get_tensor_for(lt, backend);
} }
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) { struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor; struct ggml_tensor * tensor;
if (backend != GGML_BACKEND_CPU) { if (backend != GGML_BACKEND_CPU) {
ggml_set_no_alloc(ggml_ctx, true); ggml_set_no_alloc(ggml_ctx, true);
@ -866,7 +874,7 @@ struct llama_model_loader {
size_t data_size = 0; size_t data_size = 0;
size_t prefetch_size = 0; size_t prefetch_size = 0;
size_t lock_size = 0; size_t lock_size = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) { for (const gguf_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size; data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
prefetch_size += lt.size; prefetch_size += lt.size;
@ -881,7 +889,7 @@ struct llama_model_loader {
} }
size_t done_size = 0; size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) { for (gguf_load_tensor & lt : tensors_map.tensors) {
if (progress_callback) { if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data); progress_callback((float) done_size / data_size, progress_callback_user_data);
} }
@ -928,7 +936,7 @@ struct llama_model_loader {
} }
} }
void load_data_for(llama_load_tensor & lt) { void load_data_for(gguf_load_tensor & lt) {
if (use_mmap) { if (use_mmap) {
lt.data = (uint8_t *) mapping->addr + lt.file_off; lt.data = (uint8_t *) mapping->addr + lt.file_off;
} else { } else {
@ -942,7 +950,7 @@ struct llama_model_loader {
} }
} }
static void print_checksum(llama_load_tensor & lt) { static void print_checksum(gguf_load_tensor & lt) {
uint32_t sum = 0; uint32_t sum = 0;
for (size_t i = 0; i < lt.size; i++) { for (size_t i = 0; i < lt.size; i++) {
uint8_t byte = lt.data[i]; uint8_t byte = lt.data[i];
@ -1421,7 +1429,7 @@ static void llama_model_load_internal(
} }
// populate `tensors_by_name` // populate `tensors_by_name`
for (llama_load_tensor & lt : ml->tensors_map.tensors) { for (gguf_load_tensor & lt : ml->tensors_map.tensors) {
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
} }
@ -2896,7 +2904,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
// quantization // quantization
// //
static void llama_convert_tensor_internal(const llama_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) { static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
if (output.size < nelements * sizeof(float)) { if (output.size < nelements * sizeof(float)) {
output.resize(nelements * sizeof(float)); output.resize(nelements * sizeof(float));
} }
@ -3018,7 +3026,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}; };
size_t idx = 0; size_t idx = 0;
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
gguf_buffer read_data; gguf_buffer read_data;
read_data.resize(tensor.size); read_data.resize(tensor.size);
tensor.data = read_data.addr; tensor.data = read_data.addr;
@ -3570,7 +3578,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
return 1; return 1;
} }
size_t idx = model_loader->tensors_map.name_to_idx[base_name]; size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; gguf_load_tensor & lt = model_loader->tensors_map.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
lt.data = (uint8_t *) lt.ggml_tensor->data; lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt); model_loader->load_data_for(lt);