diff --git a/llama.cpp b/llama.cpp
index 0b334b4e9..79b48897d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -927,6 +927,7 @@ enum e_model {
 
 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -1280,6 +1281,7 @@ struct llama_model_loader {
     int n_created = 0;
 
     int64_t n_elements = 0;
+    size_t  n_bytes    = 0;
 
     bool use_mmap = false;
 
@@ -1312,6 +1314,7 @@ struct llama_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
             n_elements += ggml_nelements(t);
+            n_bytes    += ggml_nbytes(t);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1909,7 +1912,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale   = %g\n",     __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type   = %s\n",     __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype  = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model size   = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size   = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size   = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }
 
     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n",    __func__, model.name.c_str());
@@ -3495,7 +3503,7 @@ static struct ggml_cgraph * llm_build_starcoder(
 
         ggml_allocr_alloc(lctx.alloc, token);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(token->data, embd, N * n_embd * ggml_element_size(inpL));
+            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
         }
     }
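
A minimal standalone sketch (not part of the patch) of the arithmetic behind the new log lines: BPW is bits per weight, i.e. total tensor bytes times 8 divided by the parameter count, and the size is printed in MiB when under 1 GiB, otherwise in GiB. The n_elements and n_bytes values below are hypothetical, chosen to resemble a ~7B model quantized to roughly 4.5 bits per weight.

// sketch.cpp — illustrative only; the counts are made-up values, not real loader output
#include <cstdio>
#include <cstdint>

int main() {
    static const size_t kB = 1024;
    static const size_t GB = kB*kB*kB;

    const int64_t n_elements = 6738415616;  // hypothetical: ~6.74 B parameters
    const size_t  n_bytes    = 3825065984;  // hypothetical: ~3.56 GiB of tensor data

    printf("model params = %.2f B\n", n_elements*1e-9);
    if (n_bytes < GB) {
        printf("model size   = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0,        n_bytes*8.0/n_elements);
    } else {
        printf("model size   = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
    }
    return 0;
}

With these made-up numbers the sketch prints "model params = 6.74 B" and "model size   = 3.56 GiB (4.54 BPW)", since 3825065984*8/6738415616 ≈ 4.54.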