diff --git a/CMakeLists.txt b/CMakeLists.txt index dff4942cd..c6ed458b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -529,7 +529,6 @@ endif() add_library(llama llama.cpp llama.h - llama-util.h ) target_include_directories(llama PUBLIC .) diff --git a/Makefile b/Makefile index ce36fcf6b..4d2a8cfa9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gguf-llama-simple gptneox-main +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gptneox-main # Binaries only useful for tests TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 @@ -329,10 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h OBJS += ggml-alloc.o -llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h +llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h @@ -388,10 +385,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput -gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) - -gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS) +gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) gptneox-main: gptneox-main.cpp ggml.o $(OBJS) diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py index 53ab5a3ed..27841939d 100644 --- a/convert-llama-7b-pth-to-gguf.py +++ b/convert-llama-7b-pth-to-gguf.py @@ -132,7 +132,7 @@ if Path(dir_model + "/tokenizer.model").is_file(): toktype = 1 # defualt to normal token type if tokenizer.is_unknown(i): toktype = 2 if tokenizer.is_control(i): toktype = 3 - + # TODO: How to determinate if a token is user defined? # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto # if tokenizer.is_user_defined(i): toktype = 4 @@ -223,7 +223,7 @@ for part_name in part_names: sys.exit() n_dims = len(data.shape) - data_dtype = data.dtype + data_dtype = data.dtype # if f32 desired, convert any float16 to float32 if ftype == 0 and data.dtype == np.float16: @@ -261,7 +261,6 @@ for part_name in part_names: for name in model_part.keys(): data = model_part[name] - old_dtype = data.dtype # we don't need these @@ -284,7 +283,7 @@ for part_name in part_names: sys.exit() n_dims = len(data.shape) - data_dtype = data.dtype + data_dtype = data.dtype # if f32 desired, convert any float16 to float32 if ftype == 0 and data.dtype == np.float16: diff --git a/convert-llama-h5-to-gguf.py b/convert-llama-h5-to-gguf.py index d7706f618..35f85bb97 100644 --- a/convert-llama-h5-to-gguf.py +++ b/convert-llama-h5-to-gguf.py @@ -95,12 +95,21 @@ if "_name_or_path" in hparams: else: hf_repo="" +if "max_sequence_length" in hparams: + ctx_length = hparams["max_sequence_length"] +elif "max_position_embeddings" in hparams: + ctx_length = hparams["max_position_embeddings"] +else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + gguf_writer.add_architecture(llm_arch) gguf_writer.add_name(last_dir) gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32") gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth") -gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"]) +gguf_writer.add_context_length(llm_arch, ctx_length) gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"]) gguf_writer.add_block_count(llm_arch, block_count) gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"]) @@ -318,7 +327,7 @@ for part_name in part_names: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype)) gguf_writer.write_tensor_to_file(data) diff --git a/examples/common.cpp b/examples/common.cpp index 18de8ef43..8beb63f36 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "-gqa" || arg == "--gqa") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gqa = std::stoi(argv[i]); - } else if (arg == "-eps" || arg == "--rms-norm-eps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rms_norm_eps = std::stof(argv[i]); } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; @@ -546,8 +534,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa); - fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); @@ -638,8 +624,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.n_ctx = params.n_ctx; lparams.n_batch = params.n_batch; - lparams.n_gqa = params.n_gqa; - lparams.rms_norm_eps = params.rms_norm_eps; lparams.n_gpu_layers = params.n_gpu_layers; lparams.main_gpu = params.main_gpu; lparams.tensor_split = params.tensor_split; diff --git a/examples/common.h b/examples/common.h index 0e520ed4e..b8510dfe4 100644 --- a/examples/common.h +++ b/examples/common.h @@ -23,14 +23,12 @@ struct gpt_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 1a238c4dd..af493e15b 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" + #include #include #include @@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) { return false; } uint32_t magic = file.read_u32(); - return magic == LLAMA_FILE_MAGIC; + return magic == GGUF_MAGIC; } void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { @@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod if (file.fp == NULL) { return; } - // write_magic - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_score = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - - // stuff AK weights into GG weights one by one. - // w->token_embedding_table -> model->tok_embeddings - // float* -> struct ggml_tensor - stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table); - - stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); - //print_row(model->norm, 0); - - // for rms-att-weight - int row_length = model->hparams.n_embd; - const auto & hparams = model->hparams; - //int n_ff = model->hparams.n_embd; - int n_ff = get_n_ff(&hparams); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ - auto & layer = model->layers[i]; - // 1d - stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); - stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); - - // from 3d matrix layer x dim x dim to 2d matrix dim x dim - stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); - - stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); - stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); - stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); - } - // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); // ? - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } +#pragma message("TODO: implement file saving using gguf") + (void) vocab; + (void) model; + (void) w; +// // write_magic +// file.write_u32(LLAMA_FILE_MAGIC); // magic +// file.write_u32(LLAMA_FILE_VERSION); // version +// // write_hparams +// file.write_u32(model->hparams.n_vocab); +// file.write_u32(model->hparams.n_embd); +// file.write_u32(model->hparams.n_mult); +// file.write_u32(model->hparams.n_head); +// file.write_u32(model->hparams.n_layer); +// file.write_u32(model->hparams.n_rot); +// file.write_u32(LLAMA_FTYPE_ALL_F32); +// +// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. +// uint32_t n_vocab = model->hparams.n_vocab; +// for (uint32_t i = 0; i < n_vocab; i++) { +// const auto & token_score = vocab->id_to_token.at(i); +// file.write_u32((uint32_t) token_score.tok.size()); +// file.write_raw(token_score.tok.data(), token_score.tok.size()); +// file.write_raw(&token_score.score, sizeof(token_score.score)); +// } +// +// // stuff AK weights into GG weights one by one. +// // w->token_embedding_table -> model->tok_embeddings +// // float* -> struct ggml_tensor +// stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); +// stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table); +// +// stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); +// //print_row(model->norm, 0); +// +// // for rms-att-weight +// int row_length = model->hparams.n_embd; +// const auto & hparams = model->hparams; +// //int n_ff = model->hparams.n_embd; +// int n_ff = get_n_ff(&hparams); +// +// for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ +// auto & layer = model->layers[i]; +// // 1d +// stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); +// stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); +// +// // from 3d matrix layer x dim x dim to 2d matrix dim x dim +// stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); +// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); +// stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); +// stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); +// +// stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); +// stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); +// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); +// } +// // write tensors +// write_tensor(&file, model->tok_embeddings); +// write_tensor(&file, model->norm); +// write_tensor(&file, model->output); // ? +// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { +// auto & layer = model->layers[i]; +// +// write_tensor(&file, layer.attention_norm); +// write_tensor(&file, layer.wq); +// write_tensor(&file, layer.wk); +// write_tensor(&file, layer.wv); +// write_tensor(&file, layer.wo); +// write_tensor(&file, layer.ffn_norm); +// write_tensor(&file, layer.w1); +// write_tensor(&file, layer.w2); +// write_tensor(&file, layer.w3); +// } } struct train_params get_default_train_params() { diff --git a/examples/gguf/gguf-llama-simple.cpp b/examples/gguf/gguf-llama-simple.cpp deleted file mode 100644 index e59d1cfc1..000000000 --- a/examples/gguf/gguf-llama-simple.cpp +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include "common.h" -#include "gguf-llama.h" -#include "build-info.h" - -#include -#include -#include -#include - -int main(int argc, char ** argv) { - gpt_params params; - - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); - return 1 ; - } - - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - params.prompt = argv[2]; - } - - if (params.prompt.empty()) { - params.prompt = "Hello my name is"; - } - - // init LLM - - llama_backend_init(params.numa); - - llama_context_params ctx_params = llama_context_default_params(); - - llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); - - if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); - return 1; - } - - llama_context * ctx = llama_new_context_with_model(model, ctx_params); - - // tokenize the prompt - - std::vector tokens_list; - tokens_list = ::llama_tokenize(ctx, params.prompt, true); - - const int max_context_size = llama_n_ctx(ctx); - const int max_tokens_list_size = max_context_size - 4; - - if ((int) tokens_list.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size); - return 1; - } - - fprintf(stderr, "\n\n"); - - for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str()); - } - - fflush(stderr); - - // main loop - - // The LLM keeps a contextual cache memory of previous token evaluation. - // Usually, once this cache is full, it is required to recompute a compressed context based on previous - // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist - // example, we will just stop the loop once this cache is full or once an end of stream is detected. - - const int n_gen = std::min(32, max_context_size); - - while (llama_get_kv_cache_token_count(ctx) < n_gen) { - // evaluate the transformer - - if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - - tokens_list.clear(); - - // sample the next token - - llama_token new_token_id = 0; - - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - std::vector candidates; - candidates.reserve(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - new_token_id = llama_sample_token_greedy(ctx , &candidates_p); - - // is it an end of stream ? - if (new_token_id == llama_token_eos()) { - fprintf(stderr, " [end of text]\n"); - break; - } - - // print the new token : - printf("%s", llama_token_to_str(ctx, new_token_id).c_str()); - fflush(stdout); - - // push this new token for next evaluation - tokens_list.push_back(new_token_id); - } - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - fprintf(stderr, "\n\n"); - - return 0; -} diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index ad212d752..d742dce17 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,5 +1,5 @@ #include "ggml.h" -#include "gguf-llama.h" +#include "llama.h" #include #include diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9c15febb5..5c2f64883 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -266,9 +266,6 @@ int main(int argc, char ** argv) { params.interactive = true; } - // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); - if (params.verbose_prompt) { fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); @@ -778,8 +775,7 @@ int main(int argc, char ** argv) { if (grammar != NULL) { llama_grammar_free(grammar); - std::vector grammar_rules( - parsed_grammar.c_rules()); + std::vector grammar_rules( parsed_grammar.c_rules()); grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 744f549c5..f628d0642 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] +// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // void usage(const char * executable) { - fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable); + fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); fprintf(stderr, "\nAllowed quantization types:\n"); @@ -118,8 +118,8 @@ int main(int argc, char ** argv) { if (pos != std::string::npos) { fpath = fname_inp.substr(0, pos + 1); } - // export as [inp path]/ggml-model-[ftype].bin - fname_out = fpath + "ggml-model-" + ftype_str + ".bin"; + // export as [inp path]/ggml-model-[ftype].gguf + fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; arg_idx++; } else { diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index d5a81978e..3db61b754 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -26,7 +26,6 @@ int main(int argc, char ** argv) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; - lparams.n_gqa = params.n_gqa; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.use_mmap = params.use_mmap; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 222dbcb43..144372061 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -651,8 +651,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa); - fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -773,23 +771,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } - else if (arg == "-gqa" || arg == "--gqa") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.n_gqa = std::stoi(argv[i]); - } - else if (arg == "-eps" || arg == "--rms-norm-eps") { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.rms_norm_eps = std::stof(argv[i]); - } else if (arg == "--rope-freq-base") { if (++i >= argc) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 0dfcd7a44..476b31b2e 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -36,16 +36,17 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_context_params ctx_params = llama_context_default_params(); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + // tokenize the prompt std::vector tokens_list; @@ -54,7 +55,7 @@ int main(int argc, char ** argv) { const int max_context_size = llama_n_ctx(ctx); const int max_tokens_list_size = max_context_size - 4; - if ((int)tokens_list.size() > max_tokens_list_size) { + if ((int) tokens_list.size() > max_tokens_list_size) { fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size); return 1; } @@ -74,7 +75,9 @@ int main(int argc, char ** argv) { // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist // example, we will just stop the loop once this cache is full or once an end of stream is detected. - while (llama_get_kv_cache_token_count( ctx ) < max_context_size) { + const int n_gen = std::min(32, max_context_size); + + while (llama_get_kv_cache_token_count(ctx) < n_gen) { // evaluate the transformer if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) { @@ -114,7 +117,6 @@ int main(int argc, char ** argv) { // push this new token for next evaluation tokens_list.push_back(new_token_id); - } llama_free(ctx); @@ -122,5 +124,7 @@ int main(int argc, char ** argv) { llama_backend_free(); + fprintf(stderr, "\n\n"); + return 0; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d446c6ea5..79599951c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -17,7 +17,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; +static const float rms_norm_eps = 1e-5f; struct random_normal_distribution { std::mt19937 gen; @@ -2612,42 +2612,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod return; } - // write_magic - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - // write_vocab - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_score = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } +#pragma message("TODO: implement file saving using gguf") + (void) vocab; + (void) model; +// // write_magic +// file.write_u32(LLAMA_FILE_MAGIC); // magic +// file.write_u32(LLAMA_FILE_VERSION); // version +// // write_hparams +// file.write_u32(model->hparams.n_vocab); +// file.write_u32(model->hparams.n_embd); +// file.write_u32(model->hparams.n_mult); +// file.write_u32(model->hparams.n_head); +// file.write_u32(model->hparams.n_layer); +// file.write_u32(model->hparams.n_rot); +// file.write_u32(LLAMA_FTYPE_ALL_F32); +// // write_vocab +// uint32_t n_vocab = model->hparams.n_vocab; +// for (uint32_t i = 0; i < n_vocab; i++) { +// const auto & token_score = vocab->id_to_token.at(i); +// file.write_u32((uint32_t) token_score.tok.size()); +// file.write_raw(token_score.tok.data(), token_score.tok.size()); +// file.write_raw(&token_score.score, sizeof(token_score.score)); +// } +// // write tensors +// write_tensor(&file, model->tok_embeddings); +// write_tensor(&file, model->norm); +// write_tensor(&file, model->output); +// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { +// auto & layer = model->layers[i]; +// +// write_tensor(&file, layer.attention_norm); +// write_tensor(&file, layer.wq); +// write_tensor(&file, layer.wk); +// write_tensor(&file, layer.wv); +// write_tensor(&file, layer.wo); +// write_tensor(&file, layer.ffn_norm); +// write_tensor(&file, layer.w1); +// write_tensor(&file, layer.w2); +// write_tensor(&file, layer.w3); +// } } float cosine_decay(const int decay_steps, const float alpha, int step) { diff --git a/ggml.c b/ggml.c index 77f57a3fd..f7e72ed84 100644 --- a/ggml.c +++ b/ggml.c @@ -9140,6 +9140,8 @@ static void ggml_compute_forward_mul( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + switch (src0->type) { case GGML_TYPE_F32: { @@ -18584,17 +18586,18 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "uint8", - [GGUF_TYPE_INT8] = "int8", - [GGUF_TYPE_UINT16] = "uint16", - [GGUF_TYPE_INT16] = "int16", - [GGUF_TYPE_UINT32] = "uint32", - [GGUF_TYPE_INT32] = "int32", - [GGUF_TYPE_FLOAT32] = "float32", + [GGUF_TYPE_UINT8] = "u8", + [GGUF_TYPE_INT8] = "i8", + [GGUF_TYPE_UINT16] = "u16", + [GGUF_TYPE_INT16] = "i16", + [GGUF_TYPE_UINT32] = "u32", + [GGUF_TYPE_INT32] = "i32", + [GGUF_TYPE_FLOAT32] = "f32", [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "string", - [GGUF_TYPE_ARRAY] = "array", + [GGUF_TYPE_STRING] = "str", + [GGUF_TYPE_ARRAY] = "arr", }; +static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); union gguf_value { uint8_t uint8; @@ -19395,17 +19398,23 @@ static void gguf_buf_grow(struct gguf_buf * buf, size_t size) { static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) { gguf_buf_grow(buf, sizeof(val->n) + val->n); - buf->data && memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } buf->offset += sizeof(val->n); - buf->data && memcpy((char *) buf->data + buf->offset, val->data, val->n); + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } buf->offset += val->n; } static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) { gguf_buf_grow(buf, el_size); - buf->data && memcpy((char *) buf->data + buf->offset, val, el_size); + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } buf->offset += el_size; } diff --git a/gguf-llama.cpp b/gguf-llama.cpp deleted file mode 100644 index d99d752ec..000000000 --- a/gguf-llama.cpp +++ /dev/null @@ -1,4968 +0,0 @@ -// Defines fileno on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#include -#include -#include -#endif - -#define LLAMA_API_CPP // TODO: eliminate me -#include "gguf-llama.h" - -#include "ggml.h" - -#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) -# include "ggml-alloc.h" -# define LLAMA_USE_ALLOCATOR -#else -# define LLAMA_USE_SCRATCH -# define LLAMA_MAX_SCRATCH_BUFFERS 16 -#endif - -#ifdef GGML_USE_CUBLAS -# include "ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) -# include "ggml-opencl.h" -#endif - -#ifdef GGML_USE_METAL -# include "ggml-metal.h" -#endif -#ifdef GGML_USE_MPI -# include "ggml-mpi.h" -#endif -#ifdef GGML_USE_K_QUANTS -# ifndef QK_K -# ifdef GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif -# endif -#endif - -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #endif - #if defined(_POSIX_MEMLOCK_RANGE) - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include - #include // for _fseeki64 -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// tensor names -#define TN_TOKEN_EMBD "token_embd.weight" -#define TN_OUTPUT_NORM "output_norm.weight" -#define TN_OUTPUT "output.weight" -#define TN_ATTN_NORM "blk.%d.attn_norm.weight" -#define TN_ATTN_Q "blk.%d.attn_q.weight" -#define TN_ATTN_K "blk.%d.attn_k.weight" -#define TN_ATTN_V "blk.%d.attn_v.weight" -#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight" -#define TN_FFN_NORM "blk.%d.ffn_norm.weight" -#define TN_FFN_GATE "blk.%d.ffn_gate.weight" -#define TN_FFN_DOWN "blk.%d.ffn_down.weight" -#define TN_FFN_UP "blk.%d.ffn_up.weight" - -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) -#endif - -// -// logging -// -LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (llama_log_level level, const char* format, ...); -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); - -#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) - -// -// helpers -// - -template -static std::string to_string(const T & val) { - std::stringstream ss; - ss << val; - return ss.str(); -} - -static void zeros(std::ofstream & file, size_t n) { - char zero = 0; - for (size_t i = 0; i < n; ++i) { - file.write(&zero, 1); - } -} - -LLAMA_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -// -// ggml helpers -// - -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - -// -// llama helpers -// - -#ifdef GGML_USE_CUBLAS -# define llama_host_malloc(n) ggml_cuda_host_malloc(n) -# define llama_host_free(data) ggml_cuda_host_free(data) -#elif GGML_USE_METAL -# define llama_host_malloc(n) ggml_metal_host_malloc(n) -# define llama_host_free(data) ggml_metal_host_free(data) -#else -# define llama_host_malloc(n) malloc(n) -# define llama_host_free(data) free(data) -#endif - -#if defined(_WIN32) -static std::string llama_format_win_err(DWORD err) { - LPSTR buf; - size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); - if (!size) { - return "FormatMessageA failed"; - } - std::string ret(buf, size); - LocalFree(buf); - return ret; -} -#endif - -struct llama_buffer { - void * data = NULL; - size_t size = 0; - - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { - llama_host_free(data); - - data = llama_host_malloc(n); - if (!data) { - fallback = true; - data = malloc(n); - } else { - fallback = false; - } - - GGML_ASSERT(data); - size = n; - } - - ~llama_buffer() { - if (data) { - if (fallback) { // NOLINT - free(data); - } else { - llama_host_free(data); - } - } - - data = NULL; - } -}; - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) const { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - void write_raw(const void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, len, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -struct llama_mmap { - void * addr; - size_t size; - - llama_mmap(const llama_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error(format("mmap failed: %s", strerror(errno))); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~llama_mmap() { - munmap(addr, size); - } -#elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) numa; - - size = file->size; - - HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); - - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - DWORD error = GetLastError(); - - if (hMapping == NULL) { - throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); - } - - addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - error = GetLastError(); - CloseHandle(hMapping); - - if (addr == NULL) { - throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); - } - - #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 - if (prefetch) { - // Advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } - #else - #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") - #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 - } - - ~llama_mmap() { - if (!UnmapViewOfFile(addr)) { - fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } -#else - static constexpr bool SUPPORTED = false; - - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) file; - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -// Represents some region of memory being locked using mlock or VirtualLock; -// will automatically unlock on destruction. -struct llama_mlock { - void * addr = NULL; - size_t size = 0; - - bool failed_already = false; - - llama_mlock() {} - llama_mlock(const llama_mlock &) = delete; - - ~llama_mlock() { - if (size) { - raw_unlock(addr, size); - } - } - - void init(void * ptr) { - GGML_ASSERT(addr == NULL && size == 0); // NOLINT - addr = ptr; - } - - void grow_to(size_t target_size) { - GGML_ASSERT(addr); - if (failed_already) { - return; - } - size_t granularity = lock_granularity(); - target_size = (target_size + granularity - 1) & ~(granularity - 1); - if (target_size > size) { - if (raw_lock((uint8_t *) addr + size, target_size - size)) { - size = target_size; - } else { - failed_already = true; - } - } - } - -#ifdef _POSIX_MEMLOCK_RANGE - static constexpr bool SUPPORTED = true; - - static size_t lock_granularity() { - return (size_t) sysconf(_SC_PAGESIZE); - } - - #ifdef __APPLE__ - #define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" - #else - #define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" - #endif - - bool raw_lock(const void * addr, size_t size) const { - if (!mlock(addr, size)) { - return true; - } - - char* errmsg = std::strerror(errno); - bool suggest = (errno == ENOMEM); - - // Check if the resource limit is fine after all - struct rlimit lock_limit; - if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { - suggest = false; - } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { - suggest = false; - } - - fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); - return false; - } - - #undef MLOCK_SUGGESTION - - static void raw_unlock(void * addr, size_t size) { - if (munlock(addr, size)) { - fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); - } - } -#elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - static size_t lock_granularity() { - SYSTEM_INFO si; - GetSystemInfo(&si); - return (size_t) si.dwPageSize; - } - - bool raw_lock(void * ptr, size_t len) const { - for (int tries = 1; ; tries++) { - if (VirtualLock(ptr, len)) { - return true; - } - if (tries == 2) { - fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", - len, size, llama_format_win_err(GetLastError()).c_str()); - return false; - } - - // It failed but this was only the first try; increase the working - // set size and try again. - SIZE_T min_ws_size, max_ws_size; - if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { - fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - return false; - } - // Per MSDN: "The maximum number of pages that a process can lock - // is equal to the number of pages in its minimum working set minus - // a small overhead." - // Hopefully a megabyte is enough overhead: - size_t increment = len + 1048576; - // The minimum must be <= the maximum, so we need to increase both: - min_ws_size += increment; - max_ws_size += increment; - if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { - fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - return false; - } - } - } - - static void raw_unlock(void * ptr, size_t len) { - if (!VirtualUnlock(ptr, len)) { - fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } -#else - static constexpr bool SUPPORTED = false; - - static size_t lock_granularity() { - return (size_t) 65536; - } - - bool raw_lock(const void * addr, size_t len) const { - fprintf(stderr, "warning: mlock not supported on this system\n"); - return false; - } - - static void raw_unlock(const void * addr, size_t len) {} -#endif -}; - -typedef void (*offload_func_t)(struct ggml_tensor * tensor); - -void llama_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} - -// -// globals -// - -struct llama_state { - // We save the log callback globally - llama_log_callback log_callback = llama_log_callback_default; - void * log_callback_user_data = nullptr; -}; - -static llama_state g_state; - -// -// memory sizes (calculated for n_batch == 512) -// - -// computed for n_ctx == 2048 -// TODO: dynamically determine these sizes -// needs modifications in ggml - -// available llama models -enum e_model { - MODEL_UNKNOWN, - MODEL_3B, - MODEL_7B, - MODEL_13B, - MODEL_30B, - MODEL_65B, - MODEL_70B, -}; - -static const size_t kB = 1024; -static const size_t MB = 1024*1024; - -static const std::map & MEM_REQ_SCRATCH0(int n_ctx) -{ - static std::map k_sizes = { - { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB }, - { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess - { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB }, - }; - return k_sizes; -} - -static const std::map & MEM_REQ_SCRATCH1() -{ - static std::map k_sizes = { - { MODEL_3B, 128ull * MB }, - { MODEL_7B, 160ull * MB }, - { MODEL_13B, 192ull * MB }, - { MODEL_30B, 256ull * MB }, - { MODEL_65B, 384ull * MB }, // guess - { MODEL_70B, 304ull * MB }, - }; - return k_sizes; -} - -// used to store the compute graph tensors + non-scratch data -static const std::map & MEM_REQ_EVAL() -{ - static std::map k_sizes = { - { MODEL_3B, 8ull * MB }, - { MODEL_7B, 10ull * MB }, - { MODEL_13B, 12ull * MB }, - { MODEL_30B, 16ull * MB }, - { MODEL_65B, 24ull * MB }, // guess - { MODEL_70B, 24ull * MB }, - }; - return k_sizes; -} - -// amount of VRAM needed per batch size to hold temporary results -// the values for 3b are not derived from testing but instead chosen conservatively -static const std::map & VRAM_REQ_SCRATCH_BASE() -{ - static std::map k_sizes = { - { MODEL_3B, 512ull * kB }, - { MODEL_7B, 512ull * kB }, - { MODEL_13B, 640ull * kB }, - { MODEL_30B, 768ull * kB }, - { MODEL_65B, 1280ull * kB }, - { MODEL_70B, 1280ull * kB }, - }; - return k_sizes; -} - -// amount of VRAM needed per batch size and context to hold temporary results -// the values for 3b are not derived from testing but instead chosen conservatively -static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() -{ - static std::map k_sizes = { - { MODEL_3B, 128ull }, - { MODEL_7B, 128ull }, - { MODEL_13B, 160ull }, - { MODEL_30B, 208ull }, - { MODEL_65B, 256ull }, - { MODEL_70B, 256ull }, - }; - return k_sizes; -} - -// default hparams (LLaMA 7B) -struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; - uint32_t n_embd = 4096; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - uint32_t n_ff = 11008; - - float f_norm_rms_eps = 1e-5; - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - - enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; - - bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT - } - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } - - size_t kv_size() const { - size_t result = 2ull; - result *= (size_t) n_embd_gqa(); - result *= (size_t) n_ctx; - result *= (size_t) n_layer; - result *= sizeof(ggml_fp16_t); - return result; - } -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - -struct llama_kv_cache { - struct ggml_tensor * k = NULL; - struct ggml_tensor * v = NULL; - - struct ggml_context * ctx = NULL; - - llama_buffer buf; - - int n; // number of tokens currently in the cache - - ~llama_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS - ggml_cuda_free_data(k); - ggml_cuda_free_data(v); -#endif // GGML_USE_CUBLAS - } -}; - -struct llama_vocab { - // TODO: - // - add a vector of merges - // - add members for bos/eos/pad/sep tokens - // so that we can pass it to different types of tokenizers with a common interface - - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - -struct llama_model { - e_model type = MODEL_UNKNOWN; - - llama_hparams hparams; - llama_vocab vocab; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; - int n_gpu_layers; - - // context - struct ggml_context * ctx = NULL; - - // the model memory buffer - llama_buffer buf; - - // model memory mapped file - std::unique_ptr mapping; - - // objects representing data potentially being locked in memory - llama_mlock mlock_buf; - llama_mlock mlock_mmap; - - // for quantize-stats only - std::vector> tensors_by_name; - - int64_t t_load_us = 0; - int64_t t_start_us = 0; - - ~llama_model() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); -#elif defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - } -}; - -struct llama_context { - llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} - ~llama_context() { - if (model_owner) { - delete &model; - } -#ifdef GGML_USE_METAL - if (ctx_metal) { - ggml_metal_free(ctx_metal); - } -#endif -#ifdef LLAMA_USE_ALLOCATOR - if (alloc) { - ggml_allocr_free(alloc); - } -#endif - } - - std::mt19937 rng; - - bool has_evaluated_once = false; - - int64_t t_sample_us = 0; - int64_t t_eval_us = 0; - int64_t t_p_eval_us = 0; - - int32_t n_sample = 0; // number of tokens sampled - int32_t n_eval = 0; // number of eval calls - int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - - const llama_model & model; - - bool model_owner = false; - - int64_t t_load_us; - int64_t t_start_us; - - // key + value cache for the self attention - struct llama_kv_cache kv_self; - - size_t mem_per_token = 0; - - // decode output (2-dimensional array: [n_tokens][n_vocab]) - std::vector logits; - bool logits_all = false; - - // input embedding (1-dimensional array: [n_embd]) - std::vector embedding; - - // reusable buffer for `struct ggml_graph_plan.work_data` - std::vector work_buffer; - - // memory buffers used to evaluate the model - // TODO: move in llama_state - llama_buffer buf_compute; - -#ifdef LLAMA_USE_ALLOCATOR - llama_buffer buf_alloc; - ggml_allocr * alloc = NULL; -#endif - -#ifdef LLAMA_USE_SCRATCH - llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; - - int buf_last = 0; - size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; -#endif - -#ifdef GGML_USE_METAL - ggml_metal_context * ctx_metal = NULL; -#endif - -#ifdef GGML_USE_MPI - ggml_mpi_context * ctx_mpi = NULL; -#endif - - static void use_buf(struct ggml_context * ctx, int i) { -#if defined(LLAMA_USE_SCRATCH) - size_t last_size = 0; - - if (i == -1) { - last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); - } else { - auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.data, }); - } - - if (buf_last >= 0) { - buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); - } - - buf_last = i; -#else - (void) i; - (void) ctx; -#endif - } - - static size_t get_buf_max_mem(int i) { -#if defined(LLAMA_USE_SCRATCH) - return buf_max_size[i]; -#else - (void) i; - return 0; -#endif - } -}; - -// -// kv cache helpers -// - -static bool llama_kv_cache_init( - const struct llama_hparams & hparams, - struct llama_kv_cache & cache, - ggml_type wtype, - int n_ctx, - int n_gpu_layers) { - const int n_embd = hparams.n_embd_gqa(); - const int n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - cache.n = 0; - - struct ggml_init_params params; - params.mem_size = cache.buf.size; - params.mem_buffer = cache.buf.data; - params.no_alloc = false; - - cache.ctx = ggml_init(params); - - if (!cache.ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - - cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - ggml_set_name(cache.k, "cache_k"); - ggml_set_name(cache.v, "cache_v"); - - (void) n_gpu_layers; -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer + 1) { - ggml_cuda_assign_buffers_no_scratch(cache.v); - } - if (n_gpu_layers > n_layer + 2) { - ggml_cuda_assign_buffers_no_scratch(cache.k); - } -#endif // GGML_USE_CUBLAS - - return true; -} - -// -// model loading and saving -// - -enum llama_file_version { - GGUF_FILE_VERSION_V1 = 1, -}; - -static const char * llama_file_version_name(llama_file_version version) { - switch (version) { - case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)"; - } - - return "unknown"; -} - -static std::string llama_format_tensor_shape(const std::vector & ne) { - char buf[256]; - snprintf(buf, sizeof(buf), "%5u", ne.at(0)); - for (size_t i = 1; i < ne.size(); i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5u", ne.at(i)); - } - return buf; -} - -static std::string llama_format_tensor_shape(const struct ggml_tensor * t) { - char buf[256]; - snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); - } - return buf; -} - -struct llama_model_loader { - int n_kv = 0; - int n_tensors = 0; - int n_created = 0; - - bool use_mmap = false; - - llama_file file; - llama_file_version file_version; - - std::unique_ptr mapping; - - struct gguf_context * ctx_gguf = NULL; - struct ggml_context * ctx_meta = NULL; - - llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - - ctx_gguf = gguf_init_from_file(fname.c_str(), params); - - n_kv = gguf_get_n_kv(ctx_gguf); - n_tensors = gguf_get_n_tensors(ctx_gguf); - - file_version = (enum llama_file_version) gguf_get_version(ctx_gguf); - - // print meta data - // TODO: make optional - { - LLAMA_LOG_INFO("%s: loaded meta data with %d key-value paris and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(file_version)); - - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name); - - LLAMA_LOG_INFO("%s: - tensor %3d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); - } - - for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx_gguf, i); - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); - - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type)); - } - } - - if (!llama_mmap::SUPPORTED) { - LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); - use_mmap = false; - } - - this->use_mmap = use_mmap; - } - - const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); - } - - struct ggml_tensor * get_tensor_meta(int i) const { - return ggml_get_tensor(ctx_meta, get_tensor_name(i)); - } - - void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { - ctx_size_p = 0; - mmapped_size_p = 0; - - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * meta = get_tensor_meta(i); - ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); - } - } - - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) { - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, true); - } - - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: ggml_set_backend - ggml_set_name(tensor, ggml_get_name(meta)); - - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, use_mmap); - } - - n_created++; - - return tensor; - } - - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); - - { - bool is_ok = true; - for (size_t i = 0; i < ne.size(); ++i) { - if (ne[i] != cur->ne[i]) { - is_ok = false; - break; - } - } - if (!is_ok) { - throw std::runtime_error( - format("%s: tensor '%s' has wrong shape; expected %s, got %s", - __func__, name.c_str(), - llama_format_tensor_shape(ne).c_str(), - llama_format_tensor_shape(cur).c_str())); - } - } - - return create_tensor_for(ctx, cur, backend); - } - - void done_getting_tensors() const { - if (n_created != n_tensors) { - throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); - } - } - - size_t file_offset(const char * name) const { - const int idx = gguf_find_tensor(ctx_gguf, name); - - if (idx < 0) { - throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); - } - - return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); - } - - void load_data_for(struct ggml_tensor * cur) const { - const size_t offs = file_offset(ggml_get_name(cur)); - - if (use_mmap) { - cur->data = (uint8_t *) mapping->addr + offs; - } else { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } - } - - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { - size_t size_data = 0; - size_t size_lock = 0; - size_t size_pref = 0; // prefetch - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - if (cur->backend == GGML_BACKEND_CPU) { - size_pref += ggml_nbytes(cur); - } - } - - if (use_mmap) { - mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); - if (lmlock) { - lmlock->init(mapping->addr); - } - } - - size_t done_size = 0; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - GGML_ASSERT(cur); // unused tensors should have been caught by load_data already - - if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); - } - - // allocate temp buffer if not using mmap - if (!use_mmap && cur->data == NULL) { - GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); - cur->data = malloc(ggml_nbytes(cur)); - } - - load_data_for(cur); - - switch (cur->backend) { - case GGML_BACKEND_CPU: - if (use_mmap && lmlock) { - size_lock += ggml_nbytes(cur); - lmlock->grow_to(size_lock); - } - break; -#if defined(GGML_USE_CUBLAS) - case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: - // old code: - //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); - - // TODO: test if this works !! - ggml_cuda_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; -#elif defined(GGML_USE_CLBLAST) - case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; -#endif - default: - continue; - } - - done_size += ggml_nbytes(cur); - } - } -}; - -// -// load LLaMA models -// - -static const char * llama_ftype_name(enum llama_ftype ftype) { - switch (ftype) { - case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; - - // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; - - default: return "unknown, may not work"; - } -} - -static const char * llama_model_type_name(e_model type) { - switch (type) { - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; - case MODEL_13B: return "13B"; - case MODEL_30B: return "30B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - default: GGML_ASSERT(false); - } -} - -static void llama_model_load_internal( - const std::string & fname, - llama_model & model, - llama_vocab & vocab, - int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - - std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); - - model.n_gpu_layers = n_gpu_layers; - - auto & hparams = model.hparams; - - // read hparams - { - struct gguf_context * ctx = ml->ctx_gguf; - -#define GGUF_GET(dst, func, type, req, key) \ - { \ - const int kid = gguf_find_key(ctx, key); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", key, gguf_type_name(ktype))); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", key)); \ - } \ - } - - GGUF_GET(hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens"); - GGUF_GET(hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length"); - GGUF_GET(hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length"); - GGUF_GET(hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length"); - GGUF_GET(hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count"); - GGUF_GET(hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count"); - GGUF_GET(hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count"); - GGUF_GET(hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon"); - - // n_head_kv is optional, default to n_head - hparams.n_head_kv = hparams.n_head; - GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv"); -#undef GGUF_GET - - switch (hparams.n_layer) { - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - default: - { - if (hparams.n_layer < 32) { - model.type = e_model::MODEL_7B; - } - } break; - } - - hparams.n_ctx = n_ctx; - - // LLaMAv2 - // TODO: probably not needed - { - const auto n_gqa = hparams.n_gqa(); - - if (model.type == e_model::MODEL_65B && n_gqa == 8) { - LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; - } - } - - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; - } - - // read vocab - { - struct gguf_context * ctx = ml->ctx_gguf; - - vocab.id_to_token.resize(hparams.n_vocab); - - const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens"); - if (token_idx == -1) { - throw std::runtime_error("cannot find token list in GGUF file\n"); - } - - const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores"); - if (score_idx == -1) { - throw std::runtime_error("cannot find token scores list in GGUF file\n"); - } - - const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - - for (uint32_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); - - vocab.token_to_id[word] = i; - - auto & tok_score = vocab.id_to_token[i]; - tok_score.tok = std::move(word); - tok_score.score = scores[i]; - } - } - - { - LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml->file_version)); - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); - LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); - LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); - LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type)); - } - - if (vocab_only) { - return; - } - - auto & ctx = model.ctx; - - size_t ctx_size; - size_t mmapped_size; - - ml->calc_sizes(ctx_size, mmapped_size); - - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); - - // create the ggml context - { - model.buf.resize(ctx_size); - if (use_mlock) { - model.mlock_buf.init (model.buf.data); - model.mlock_buf.grow_to(model.buf.size); - } - - struct ggml_init_params params = { - /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.data, - /*.no_alloc =*/ ml->use_mmap, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); - } - } - - (void) main_gpu; - (void) mul_mat_q; -#if defined(GGML_USE_CUBLAS) - LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - ggml_cuda_set_mul_mat_q(mul_mat_q); -#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#elif defined(GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); -#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU -#else -#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - // prepare memory for the weights - size_t vram_weights = 0; - size_t vram_scratch = 0; - { - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_embd_gqa = hparams.n_embd_gqa(); - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; - - model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU); - - // "output" tensor - { - ggml_backend backend_norm; - ggml_backend backend_output; - if (n_gpu_layers > int(n_layer)) { // NOLINT - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; -#else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; -#endif // _WIN32 - - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.norm = ml->create_tensor(ctx, TN_OUTPUT_NORM, {n_embd}, backend_norm); - model.output = ml->create_tensor(ctx, TN_OUTPUT, {n_embd, n_vocab}, backend_output); - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT - - auto & layer = model.layers[i]; - layer.attention_norm = ml->create_tensor(ctx, format(TN_ATTN_NORM, i), {n_embd}, backend); - - layer.wq = ml->create_tensor(ctx, format(TN_ATTN_Q, i), {n_embd, n_embd}, backend_split); - layer.wk = ml->create_tensor(ctx, format(TN_ATTN_K, i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml->create_tensor(ctx, format(TN_ATTN_V, i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml->create_tensor(ctx, format(TN_ATTN_OUTPUT, i), {n_embd, n_embd}, backend_split); - - layer.ffn_norm = ml->create_tensor(ctx, format(TN_FFN_NORM, i), {n_embd}, backend); - - layer.w1 = ml->create_tensor(ctx, format(TN_FFN_GATE, i), {n_embd, n_ff}, backend_split); - layer.w2 = ml->create_tensor(ctx, format(TN_FFN_DOWN, i), { n_ff, n_embd}, backend_split); - layer.w3 = ml->create_tensor(ctx, format(TN_FFN_UP, i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); - } - } - } - - ml->done_getting_tensors(); - - // print memory requirements - { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; - - // this is the total memory required to run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory - -#ifndef LLAMA_USE_ALLOCATOR - mem_required += - MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + - MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); -#endif - - // this is the memory required by one llama_state - const size_t mem_required_state = - scale*hparams.kv_size(); - - LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - - (void) vram_scratch; - (void) n_batch; -#ifdef GGML_USE_CUBLAS - if (low_vram) { - LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); - ggml_cuda_set_scratch_size(0); // disable scratch - } else { - const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type); - const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type); - vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); - ggml_cuda_set_scratch_size(vram_scratch); - if (n_gpu_layers > 0) { - LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", - __func__, vram_scratch_base / kB, vram_scratch_per_context, - (vram_scratch + MB - 1) / MB); // round up - } - } -#endif // GGML_USE_CUBLAS - -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { - LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); - } - size_t vram_kv_cache = 0; - -#ifdef GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; - if (n_gpu_layers > (int) hparams.n_layer + 1) { - if (low_vram) { - LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); - } else { - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; - } - } - if (n_gpu_layers > (int) hparams.n_layer + 2) { - if (low_vram) { - LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); - } else { - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; - } - } -#elif defined(GGML_USE_CLBLAST) - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; -#endif // GGML_USE_CUBLAS - - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", - __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n", - __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up -#else - (void) n_gpu_layers; -#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - } - - // populate `tensors_by_name` - for (int i = 0; i < ml->n_tensors; ++i) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, ml->get_tensor_name(i)); - model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); - } - - (void) tensor_split; -#if defined(GGML_USE_CUBLAS) - { - ggml_cuda_set_tensor_split(tensor_split); - } -#endif - - ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); - - if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); - } - - model.mapping = std::move(ml->mapping); - - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; -} - -static bool llama_model_load( - const std::string & fname, - llama_model & model, - llama_vocab & vocab, - int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { - try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, - use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); - return true; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; - } -} - -static struct ggml_cgraph * llama_build_graph( - llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - auto & mem_per_token = lctx.mem_per_token; - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; - -#ifdef LLAMA_USE_ALLOCATOR - params.no_alloc = true; -#endif - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); - } -#else - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); -#endif - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); - -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); - } -#else - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); -#endif - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers; - } -#endif // GGML_USE_CUBLAS - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } -#else - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); -#endif - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers; - } -#endif // GGML_USE_CUBLAS - - struct ggml_tensor * inpSA = inpL; - - llama_context::use_buf(ctx0, 0); - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attention_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa), - n_embd_head, n_head_kv, n_past + N), - 0, 2, 1, 3); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, - n_ctx*ggml_element_size(kv_self.v), - n_ctx*ggml_element_size(kv_self.v)*n_embd_head, - n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - llama_context::use_buf(ctx0, 1); - - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - llama_context::use_buf(ctx0, 0); - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - llama_context::use_buf(ctx0, -1); - - // logits -> probs - //cur = ggml_soft_max_inplace(ctx0, cur); - - ggml_build_forward_expand(gf, cur); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - -#if 0 - LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, - ggml_used_mem(ctx0)/1024.0/1024.0, - lctx.get_buf_max_mem(0)/1024.0/1024.0, - lctx.get_buf_max_mem(1)/1024.0/1024.0, - lctx.work_buffer.size()/1024.0/1024.0, - n_past, N); -#endif - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { - - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int64_t t_start_us = ggml_time_us(); - -#ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif - - const int N = n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_vocab = hparams.n_vocab; - -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_reset(lctx.alloc); -#endif - - ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); - -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_alloc_graph(lctx.alloc, gf); -#endif - - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; - - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - -#if GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif - -#ifdef GGML_USE_METAL - if (lctx.ctx_metal && N == 1) { - // TODO: disabled until #2413 is resolved - //if (!ggml_metal_if_optimized(lctx.ctx_metal)) { - // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf); - //} - ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf); - ggml_metal_get_tensor (lctx.ctx_metal, res); - if (!lctx.embedding.empty()) { - ggml_metal_get_tensor(lctx.ctx_metal, embeddings); - } - } else { - // IMPORTANT: - // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla - // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX - // coprocessor. - // - // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. - // But for now, we have focused only on Matrix x Vector Metal multiplication. - // - // TODO: avoid these syncs via shared memory (ref #1696) - // - if (lctx.ctx_metal) { - // We need to sync the GPU KV cache with the CPU KV cache - ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); - ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); - } - - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); - } -#else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); -#endif - -#if GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif - - // update kv token count - lctx.kv_self.n = n_past + N; - - if (cgraph_fname) { - ggml_graph_export(gf, cgraph_fname); - } - -#ifdef GGML_PERF - // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(gf); -#endif - - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - { - auto & logits_out = lctx.logits; - - if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); - } else { - // return result for just the last token - logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - } - } - - // extract embeddings - if (!lctx.embedding.empty()) { - auto & embedding_out = lctx.embedding; - - embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); - } - - // measure the performance only for the single-token evals - if (N == 1) { - lctx.t_eval_us += ggml_time_us() - t_start_us; - lctx.n_eval++; - } - else if (N > 1) { - lctx.t_p_eval_us += ggml_time_us() - t_start_us; - lctx.n_p_eval += N; - } - - return true; -} - -// -// tokenizer -// - -static std::string llama_vocab_type(const llama_vocab & vocab) { - return vocab.token_to_id.size() == 32000 ? "spm": "bpe"; -} - -static bool llama_is_normal_token(const llama_vocab & vocab, llama_token token) { - if (llama_vocab_type(vocab) == "spm") { - return token >= 259; - } - - if (llama_vocab_type(vocab) == "bpe") { - return token >= 95; - } - - return false; -} - -static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token token) { - if (llama_vocab_type(vocab) == "spm") { - return token == 0; - } - - // TODO: improve? - return false; -} - -static bool llama_is_control_token(const llama_vocab & vocab, llama_token token) { - if (llama_vocab_type(vocab) == "spm") { - return token == 1 || token == 2; - } - - // TODO: improve? - return false; -} - -static bool llama_is_bos_token(const llama_vocab & vocab, llama_token token) { - if (llama_vocab_type(vocab) == "spm") { - return token == 1; - } - - // TODO: improve? - return false; -} - -static bool llama_is_eos_token(const llama_vocab & vocab, llama_token token) { - if (llama_vocab_type(vocab) == "spm") { - return token == 2; - } - - // TODO: improve? - return false; -} - -static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token token) { - GGML_UNUSED(vocab); - GGML_UNUSED(token); - // TODO: improve? - return false; -} - -static bool llama_is_unused_token(const llama_vocab & vocab, llama_token token) { - GGML_UNUSED(vocab); - GGML_UNUSED(token); - // TODO: improve? - return false; -} - -static bool llama_is_byte_token(const llama_vocab & vocab, llama_token token) { - if (llama_vocab_type(vocab) == "spm") { - return 3 <= token && token < 259; - } - - if (llama_vocab_type(vocab) == "bpe") { - return 1 <= token && token < 95; - } - - return false; -} - -static uint8_t llama_byte_to_char(const llama_vocab & vocab, uint8_t byte) { - if (llama_vocab_type(vocab) == "spm") { - return byte + 3; - } - - if (llama_vocab_type(vocab) == "bpe") { - return byte + 32; - } - - return false; -} - -static std::string llama_escape_whitespace(const std::string& text) { - std::string result; - bool escaping = false; - result += "\xe2\x96\x81"; - for (size_t offs = 0; offs < text.length(); ++offs) { - if (text[offs] == ' ') { - if (!escaping) { - result += "\xe2\x96\x81"; - escaping = true; - } - } - else { - escaping = false; - result += text[offs]; - } - } - return result; -} - -static std::string llama_unescape_whitespace(const std::string& word) { - if (word.length() >= 3 && word.substr(0, 3) == "\xe2\x96\x81") { - return std::string(" ") + word.substr(3); - } - return word; -} - -static size_t utf8_len(char src) { - const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} - -struct llama_sp_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; - -static_assert(std::is_trivially_copyable::value, "llama_sp_symbol is not trivially copyable"); - -struct llama_sp_bigram { - struct comparator { - bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { - return (l.score < r.score) || (l.score == r.score && l.left > r.left); - } - }; - using queue_storage = std::vector; - using queue = std::priority_queue; - llama_sp_symbol::index left; - llama_sp_symbol::index right; - float score; - size_t size; -}; - -// original implementation: -// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 -struct llama_tokenizer { - llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {} - - void tokenize(const std::string & text, std::vector & output) { - // split string into utf8 chars - int index = 0; - size_t offs = 0; - while (offs < text.size()) { - llama_sp_symbol sym; - size_t len = utf8_len(text[offs]); - GGML_ASSERT(offs + len <= text.size()); - sym.text = text.c_str() + offs; - sym.n = len; - offs += len; - sym.prev = index - 1; - sym.next = offs == text.size() ? -1 : index + 1; - index++; - symbols_.emplace_back(sym); - } - - // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols_.size(); ++i) { - try_add_bigram(i - 1, i); - } - - // keep substituting the highest frequency pairs for as long as we can. - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); - - auto & left_sym = symbols_[bigram.left]; - auto & right_sym = symbols_[bigram.right]; - - // if one of the symbols already got merged, skip it. - if (left_sym.n == 0 || right_sym.n == 0 || - left_sym.n + right_sym.n != bigram.size) { - continue; - } - - // merge the right sym into the left one - left_sym.n += right_sym.n; - right_sym.n = 0; - - //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); - - // remove the right sym from the chain - left_sym.next = right_sym.next; - if (right_sym.next >= 0) { - symbols_[right_sym.next].prev = bigram.left; - } - - // find more substitutions - try_add_bigram(left_sym.prev, bigram.left); - try_add_bigram(bigram.left, left_sym.next); - } - - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; - resegment(symbol, output); - } - } - -private: - void resegment(llama_sp_symbol &symbol, std::vector &output) { - auto text = std::string(symbol.text, symbol.n); - auto token = vocab_.token_to_id.find(text); - - // Do we need to support is_unused? - if (token != vocab_.token_to_id.end()) { - output.push_back((*token).second); - return; - } - - const auto p = rev_merge.find(text); - - if (p == rev_merge.end()) { - // output any symbols that did not form tokens as bytes. - for (int j = 0; j < (int)symbol.n; ++j) { - llama_vocab::id token_id = llama_byte_to_char(vocab_, symbol.text[j]); - output.push_back(token_id); - } - return; - } - - resegment(symbols_[p->second.first], output); - resegment(symbols_[p->second.second], output); - } - - void try_add_bigram(int left, int right) { - if (left == -1 || right == -1) { - return; - } - - const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n); - auto token = vocab_.token_to_id.find(text); - - if (token == vocab_.token_to_id.end()) { - return; - } - - if (static_cast((*token).second) >= vocab_.id_to_token.size()) { - return; - } - - const auto &tok_score = vocab_.id_to_token[(*token).second]; - - llama_sp_bigram bigram; - bigram.left = left; - bigram.right = right; - bigram.score = tok_score.score; - bigram.size = text.size(); - work_queue_.push(bigram); - - // Do we need to support is_unused? - rev_merge[text] = std::make_pair(left, right); - } - - const llama_vocab & vocab_; - std::vector symbols_; - llama_sp_bigram::queue work_queue_; - std::map > rev_merge; -}; - -static std::vector llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) { - llama_tokenizer tokenizer(vocab); - std::vector output; - - if (raw_text.empty()) { - return output; - } - - if (bos) { - output.push_back(llama_token_bos()); - } - - std::string text; - if (escape) { - text = llama_escape_whitespace(raw_text); - } else { - text = raw_text; - } - - tokenizer.tokenize(text, output); - return output; -} - -// -// grammar - internal -// - -struct llama_grammar { - const std::vector> rules; - std::vector> stacks; -}; - -struct llama_grammar_candidate { - size_t index; - const uint32_t * code_points; -}; - -// NOTE: assumes valid utf8 (but checks for overrun) -// adds a terminating 0 for use as pointer -std::vector decode_utf8(const char * src) { - static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - const char * pos = src; - std::vector code_points; - while (*pos != 0) { - uint8_t first_byte = static_cast(*pos); - uint8_t highbits = first_byte >> 4; - int len = lookup[highbits]; - uint8_t mask = (1 << (8 - len)) - 1; - uint32_t value = first_byte & mask; - const char * end = pos + len; // may overrun! - ++pos; - for ( ; pos < end && *pos != 0; ++pos) { - value = (value << 6) + (static_cast(*pos) & 0x3F); - } - code_points.push_back(value); - } - code_points.push_back(0); - return code_points; -} - -// returns true iff pos points to the end of one of the definitions of a rule -static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) { - switch (pos->type) { - case LLAMA_GRETYPE_END: return true; // NOLINT - case LLAMA_GRETYPE_ALT: return true; // NOLINT - default: return false; - } -} - -// returns true iff chr satisfies the char range at pos (regular or inverse range) -// asserts that pos is pointing to a char range element -static std::pair llama_grammar_match_char( - const llama_grammar_element * pos, - const uint32_t chr) { - - bool found = false; - bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR; - GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT - - do { - if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { - // inclusive range, e.g. [a-z] - found = found || (pos->value <= chr && chr <= pos[1].value); - pos += 2; - } else { - // exact char match, e.g. [a] or "a" - found = found || pos->value == chr; - pos += 1; - } - } while (pos->type == LLAMA_GRETYPE_CHAR_ALT); - - return std::make_pair(found == is_positive_char, pos); -} - -// transforms a grammar pushdown stack into N possible stacks, all ending -// at a character range (terminal element) -static void llama_grammar_advance_stack( - const std::vector> & rules, - const std::vector & stack, - std::vector> & new_stacks) { - - if (stack.empty()) { - new_stacks.push_back(stack); - return; - } - - const llama_grammar_element * pos = stack.back(); - - switch (pos->type) { - case LLAMA_GRETYPE_RULE_REF: { - const size_t rule_id = static_cast(pos->value); - const llama_grammar_element * subpos = rules[rule_id].data(); - do { - // init new stack without the top (pos) - std::vector new_stack(stack.begin(), stack.end() - 1); - if (!llama_grammar_is_end_of_sequence(pos + 1)) { - // if this rule ref is followed by another element, add that to stack - new_stack.push_back(pos + 1); - } - if (!llama_grammar_is_end_of_sequence(subpos)) { - // if alternate is nonempty, add to stack - new_stack.push_back(subpos); - } - llama_grammar_advance_stack(rules, new_stack, new_stacks); - while (!llama_grammar_is_end_of_sequence(subpos)) { - // scan to end of alternate def - subpos++; - } - if (subpos->type == LLAMA_GRETYPE_ALT) { - // there's another alternate def of this rule to process - subpos++; - } else { - break; - } - } while (true); - break; - } - case LLAMA_GRETYPE_CHAR: - case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.push_back(stack); - break; - default: - // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range - // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on - // those - GGML_ASSERT(false); - } -} - -// takes a set of possible pushdown stacks on a grammar, which are required to -// be positioned at a character range (see `llama_grammar_advance_stack`), and -// produces the N possible stacks if the given char is accepted at those -// positions -static std::vector> llama_grammar_accept( - const std::vector> & rules, - const std::vector> & stacks, - const uint32_t chr) { - - std::vector> new_stacks; - - for (const auto & stack : stacks) { - if (stack.empty()) { - continue; - } - - auto match = llama_grammar_match_char(stack.back(), chr); - if (match.first) { - const llama_grammar_element * pos = match.second; - - // update top of stack to next element, if any - std::vector new_stack(stack.begin(), stack.end() - 1); - if (!llama_grammar_is_end_of_sequence(pos)) { - new_stack.push_back(pos); - } - llama_grammar_advance_stack(rules, new_stack, new_stacks); - } - } - - return new_stacks; -} - -static std::vector llama_grammar_reject_candidates( - const std::vector> & rules, - const std::vector> & stacks, - const std::vector & candidates); - -static std::vector llama_grammar_reject_candidates_for_stack( - const std::vector> & rules, - const std::vector & stack, - const std::vector & candidates) { - - std::vector rejects; - - if (stack.empty()) { - // accept nothing; EOS is handled elsewhere - rejects.insert(rejects.end(), candidates.begin(), candidates.end()); - return rejects; - } - - const llama_grammar_element * stack_pos = stack.back(); - - std::vector next_candidates; - for (auto tok : candidates) { - if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) { - if (tok.code_points[1] != 0) { - next_candidates.push_back({ tok.index, tok.code_points + 1 }); - } - } else { - rejects.push_back(tok); - } - } - - const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; - - // update top of stack to next element, if any - std::vector stack_after(stack.begin(), stack.end() - 1); - if (!llama_grammar_is_end_of_sequence(stack_pos_after)) { - stack_after.push_back(stack_pos_after); - } - std::vector> next_stacks; - llama_grammar_advance_stack(rules, stack_after, next_stacks); - - auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates); - for (auto tok : next_rejects) { - rejects.push_back({ tok.index, tok.code_points - 1 }); - } - - return rejects; -} - -static std::vector llama_grammar_reject_candidates( - const std::vector> & rules, - const std::vector> & stacks, - const std::vector & candidates) { - GGML_ASSERT(!stacks.empty()); // REVIEW - - if (candidates.empty()) { - return std::vector(); - } - - auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates); - - for (size_t i = 1, size = stacks.size(); i < size; ++i) { - rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects); - } - return rejects; -} - -// -// grammar - external -// - -struct llama_grammar * llama_grammar_init( - const llama_grammar_element ** rules, - size_t n_rules, - size_t start_rule_index) { - const llama_grammar_element * pos; - - // copy rule definitions into vectors - std::vector> vec_rules(n_rules); - for (size_t i = 0; i < n_rules; i++) { - for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) { - vec_rules[i].push_back(*pos); - } - vec_rules[i].push_back({LLAMA_GRETYPE_END, 0}); - } - - // loop over alternates of start rule to build initial stacks - std::vector> stacks; - pos = rules[start_rule_index]; - do { - std::vector stack; - if (!llama_grammar_is_end_of_sequence(pos)) { - // if alternate is nonempty, add to stack - stack.push_back(pos); - } - llama_grammar_advance_stack(vec_rules, stack, stacks); - while (!llama_grammar_is_end_of_sequence(pos)) { - // scan to end of alternate def - pos++; - } - if (pos->type == LLAMA_GRETYPE_ALT) { - // there's another alternate def of this rule to process - pos++; - } else { - break; - } - } while (true); - - return new llama_grammar{ std::move(vec_rules), std::move(stacks) }; -} - -void llama_grammar_free(struct llama_grammar * grammar) { - delete grammar; -} - -// -// sampling -// - -void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { - assert(candidates->size > 0); - - const int64_t t_start_sample_us = ggml_time_us(); - - // Sort the logits in descending order - if (!candidates->sorted) { - std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }); - candidates->sorted = true; - } - - float max_l = candidates->data[0].logit; - float cum_sum = 0.0f; - for (size_t i = 0; i < candidates->size; ++i) { - float p = expf(candidates->data[i].logit - max_l); - candidates->data[i].p = p; - cum_sum += p; - } - for (size_t i = 0; i < candidates->size; ++i) { - candidates->data[i].p /= cum_sum; - } - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) { - const int64_t t_start_sample_us = ggml_time_us(); - - k = std::max(k, (int) min_keep); - k = std::min(k, (int) candidates->size); - - // Sort scores in descending order - if (!candidates->sorted) { - auto comp = [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }; - if (k == (int) candidates->size) { - std::sort(candidates->data, candidates->data + candidates->size, comp); - } else { - std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); - } - candidates->sorted = true; - } - candidates->size = k; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { - if (p >= 1.0f) { - return; - } - - llama_sample_softmax(ctx, candidates); - - const int64_t t_start_sample_us = ggml_time_us(); - - // Compute the cumulative probabilities - float cum_sum = 0.0f; - size_t last_idx = candidates->size; - - for (size_t i = 0; i < candidates->size; ++i) { - cum_sum += candidates->data[i].p; - - // Check if the running sum is at least p or if we have kept at least min_keep tokens - // we set the last index to i+1 to indicate that the current iterate should be included in the set - if (cum_sum >= p && i + 1 >= min_keep) { - last_idx = i + 1; - break; - } - } - - // Resize the output vector to keep only the top-p tokens - candidates->size = last_idx; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { - if (z >= 1.0f || candidates->size <= 2) { - return; - } - - llama_sample_softmax(nullptr, candidates); - const int64_t t_start_sample_us = ggml_time_us(); - - // Compute the first and second derivatives - std::vector first_derivatives(candidates->size - 1); - std::vector second_derivatives(candidates->size - 2); - - for (size_t i = 0; i < first_derivatives.size(); ++i) { - first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p; - } - for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; - } - - // Calculate absolute value of second derivatives - for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = abs(second_derivatives[i]); - } - - // Normalize the second derivatives - { - const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); - - if (second_derivatives_sum > 1e-6f) { - for (float & value : second_derivatives) { - value /= second_derivatives_sum; - } - } else { - for (float & value : second_derivatives) { - value = 1.0f / second_derivatives.size(); - } - } - } - - float cum_sum = 0.0f; - size_t last_idx = candidates->size; - for (size_t i = 0; i < second_derivatives.size(); ++i) { - cum_sum += second_derivatives[i]; - - // Check if the running sum is greater than z or if we have kept at least min_keep tokens - if (cum_sum > z && i >= min_keep) { - last_idx = i; - break; - } - } - - // Resize the output vector to keep only the tokens above the tail location - candidates->size = last_idx; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { - // Reference implementation: - // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr - if (p >= 1.0f) { - return; - } - - // Compute the softmax of logits and calculate entropy - llama_sample_softmax(nullptr, candidates); - - const int64_t t_start_sample_us = ggml_time_us(); - - float entropy = 0.0f; - for (size_t i = 0; i < candidates->size; ++i) { - entropy += -candidates->data[i].p * logf(candidates->data[i].p); - } - - // Compute the absolute difference between negative log probability and entropy for each candidate - std::vector shifted_scores; - for (size_t i = 0; i < candidates->size; ++i) { - float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy); - shifted_scores.push_back(shifted_score); - } - - // Sort tokens based on the shifted_scores and their corresponding indices - std::vector indices(candidates->size); - std::iota(indices.begin(), indices.end(), 0); - - std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { - return shifted_scores[a] < shifted_scores[b]; - }); - - // Compute the cumulative probabilities - float cum_sum = 0.0f; - size_t last_idx = indices.size(); - - for (size_t i = 0; i < indices.size(); ++i) { - size_t idx = indices[i]; - cum_sum += candidates->data[idx].p; - - // Check if the running sum is greater than typical or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep - 1) { - last_idx = i + 1; - break; - } - } - - // Resize the output vector to keep only the locally typical tokens - std::vector new_candidates; - for (size_t i = 0; i < last_idx; ++i) { - size_t idx = indices[i]; - new_candidates.push_back(candidates->data[idx]); - } - - // Replace the data in candidates with the new_candidates data - std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); - candidates->size = new_candidates.size(); - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { - const int64_t t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < candidates_p->size; ++i) { - candidates_p->data[i].logit /= temp; - } - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { - if (last_tokens_size == 0 || penalty == 1.0f) { - return; - } - - const int64_t t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < candidates->size; ++i) { - const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); - if (token_iter == last_tokens + last_tokens_size) { - continue; - } - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (candidates->data[i].logit <= 0) { - candidates->data[i].logit *= penalty; - } else { - candidates->data[i].logit /= penalty; - } - } - - candidates->sorted = false; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { - if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { - return; - } - - const int64_t t_start_sample_us = ggml_time_us(); - - // Create a frequency map to count occurrences of each token in last_tokens - std::unordered_map token_count; - for (size_t i = 0; i < last_tokens_size; ++i) { - token_count[last_tokens_p[i]]++; - } - - // Apply frequency and presence penalties to the candidates - for (size_t i = 0; i < candidates->size; ++i) { - auto token_iter = token_count.find(candidates->data[i].id); - if (token_iter == token_count.end()) { - continue; - } - - int count = token_iter->second; - candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; - } - - candidates->sorted = false; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) { - assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); - - bool allow_eos = false; - for (const auto & stack : grammar->stacks) { - if (stack.empty()) { - allow_eos = true; - break; - } - } - - const llama_token eos = llama_token_eos(); - - std::vector> candidates_decoded; - std::vector candidates_grammar; - - for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - std::string str = llama_token_to_str(ctx, id); - if (id == eos) { - if (!allow_eos) { - candidates->data[i].logit = -INFINITY; - } - } else if (str.empty()) { - candidates->data[i].logit = -INFINITY; - } else { - candidates_decoded.push_back(decode_utf8(str.c_str())); - candidates_grammar.push_back({ i, candidates_decoded.back().data() }); - } - } - - const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); - for (const auto & reject : rejects) { - candidates->data[reject.index].logit = -INFINITY; - } - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; -} - -static void llama_log_softmax(float * array, size_t size) { - float max_l = *std::max_element(array, array + size); - float sum = 0.f; - for (size_t i = 0; i < size; ++i) { - float p = expf(array[i] - max_l); - sum += p; - array[i] = p; - } - - for (size_t i = 0; i < size; ++i) { - array[i] = logf(array[i] / sum); - } -} - -void llama_sample_classifier_free_guidance( - struct llama_context * ctx, - llama_token_data_array * candidates, - struct llama_context * guidance_ctx, - float scale) { - int64_t t_start_sample_us = ggml_time_us(); - - assert(ctx); - auto n_vocab = llama_n_vocab(ctx); - assert(n_vocab == (int)candidates->size); - assert(!candidates->sorted); - - std::vector logits_base; - logits_base.reserve(candidates->size); - for (size_t i = 0; i < candidates->size; ++i) { - logits_base.push_back(candidates->data[i].logit); - } - llama_log_softmax(logits_base.data(), candidates->size); - - float* logits_guidance = llama_get_logits(guidance_ctx); - llama_log_softmax(logits_guidance, n_vocab); - - for (int i = 0; i < n_vocab; ++i) { - float logit_guidance = logits_guidance[i]; - float logit_base = logits_base[i]; - candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance; - } - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { - assert(ctx); - auto N = float(llama_n_vocab(ctx)); - int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); - - llama_sample_softmax(nullptr, candidates); - - // Estimate s_hat using the most probable m tokens - float s_hat = 0.0; - float sum_ti_bi = 0.0; - float sum_ti_sq = 0.0; - for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) { - float t_i = logf(float(i + 2) / float(i + 1)); - float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p); - sum_ti_bi += t_i * b_i; - sum_ti_sq += t_i * t_i; - } - s_hat = sum_ti_bi / sum_ti_sq; - - // Compute k from the estimated s_hat and target surprise value - float epsilon_hat = s_hat - 1; - float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); - - // Sample the next word X using top-k sampling - llama_sample_top_k(nullptr, candidates, int(k), 1); - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } - llama_token X = llama_sample_token(ctx, candidates); - t_start_sample_us = ggml_time_us(); - - // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { - return candidate.id == X; - })); - float observed_surprise = -log2f(candidates->data[X_idx].p); - float e = observed_surprise - tau; - - // Update mu using the learning rate and error - *mu = *mu - eta * e; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } - return X; -} - -llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { - int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); - - llama_sample_softmax(ctx, candidates); - - // Truncate the words with surprise values greater than mu - candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { - return -log2f(candidate.p) > *mu; - })); - - if (candidates->size == 0) { - candidates->size = 1; - } - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // Normalize the probabilities of the remaining words - llama_sample_softmax(ctx, candidates); - - // Sample the next word X from the remaining words - llama_token X = llama_sample_token(ctx, candidates); - t_start_sample_us = ggml_time_us(); - - // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { - return candidate.id == X; - })); - float observed_surprise = -log2f(candidates->data[X_idx].p); - float e = observed_surprise - tau; - - // Update mu using the learning rate and error - *mu = *mu - eta * e; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } - return X; -} - -llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) { - const int64_t t_start_sample_us = ggml_time_us(); - - // Find max element - auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { - return a.logit < b.logit; - }); - - llama_token result = max_iter->id; - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; - } - return result; -} - -llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { - assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); - - std::vector probs; - probs.reserve(candidates->size); - for (size_t i = 0; i < candidates->size; ++i) { - probs.push_back(candidates->data[i].p); - } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - auto & rng = ctx->rng; - int idx = dist(rng); - - llama_token result = candidates->data[idx].id; - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; - return result; -} - -void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { - const int64_t t_start_sample_us = ggml_time_us(); - - if (token == llama_token_eos()) { - for (const auto & stack : grammar->stacks) { - if (stack.empty()) { - return; - } - } - GGML_ASSERT(false); - } - - std::string str = llama_token_to_str(ctx, token); - // Note terminating 0 in decoded string - auto code_points = decode_utf8(str.c_str()); - for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); - } - GGML_ASSERT(!grammar->stacks.empty()); - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; -} - -// -// quantization -// - -static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector & output, const size_t nelements, const int nthread) { - if (output.size() < nelements) { - output.resize(nelements); - } - float * f32_output = (float *) output.data(); - - ggml_type_traits_t qtype; - if (ggml_is_quantized(tensor->type)) { - qtype = ggml_internal_get_type_traits(tensor->type); - if (qtype.to_float == NULL) { - throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); - } - } else if (tensor->type != GGML_TYPE_F16) { - throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); - } - - if (nthread < 2) { - if (tensor->type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); - } else if (ggml_is_quantized(tensor->type)) { - qtype.to_float(tensor->data, f32_output, nelements); - } else { - GGML_ASSERT(false); // unreachable - } - return; - } - - auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type); - auto block_size_bytes = ggml_type_size(tensor->type); - - GGML_ASSERT(nelements % block_size == 0); - auto nblocks = nelements / block_size; - auto blocks_per_thread = nblocks / nthread; - auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count - - std::vector workers; - for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { - auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread - auto thr_elems = thr_blocks * block_size; // number of elements for this thread - auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread - - auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); - } else { - qtype.to_float(inbuf, outbuf, nels); - } - }; - workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); - in_buff_offs += thr_block_bytes; - out_buff_offs += thr_elems; - } - for (auto & worker : workers) { - worker.join(); - } -} - -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { - ggml_type quantized_type; - llama_ftype ftype = params->ftype; - int nthread = params->nthread; - - switch (params->ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; - case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; - case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; - case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; - case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; - case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; - -#ifdef GGML_USE_K_QUANTS - // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: - case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; - case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; -#endif - default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); - } - - if (nthread <= 0) { - nthread = std::thread::hardware_concurrency(); - } - - std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false)); - - const size_t align = GGUF_DEFAULT_ALIGNMENT; - struct gguf_context * ctx_out = gguf_init_empty(); - - // copy the KV pairs from the input file - gguf_set_kv (ctx_out, model_loader->ctx_gguf); - gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); - -#ifdef GGML_USE_K_QUANTS - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - - for (int i = 0; i < model_loader->n_tensors; ++i) { - struct ggml_tensor * meta = model_loader->get_tensor_meta(i); - - const std::string name = ggml_get_name(meta); - - if (name.find("attn_v.weight") != std::string::npos) { - ++n_attention_wv; - } - else if (name.find("ffn_down.weight") != std::string::npos) { - ++n_feed_forward_w2; - } - } - - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; -#endif - - size_t total_size_org = 0; - size_t total_size_new = 0; - std::vector hist_all(1 << 4, 0); - - std::vector workers; - std::mutex mutex; - - auto use_more_bits = [] (int i_layer, int num_layers) -> bool { - return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; - }; - - int idx = 0; - - std::vector read_data; - std::vector work; - - // populate the original tensors so we get an initial meta data - for (int i = 0; i < model_loader->n_tensors; ++i) { - struct ggml_tensor * meta = model_loader->get_tensor_meta(i); - gguf_add_tensor(ctx_out, meta); - } - - std::ofstream fout(fname_out, std::ios::binary); - - const size_t meta_size = gguf_get_meta_size(ctx_out); - - LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size); - - // placeholder for the meta data - ::zeros(fout, meta_size); - - for (int i = 0; i < model_loader->n_tensors; ++i) { - struct ggml_tensor * tensor = model_loader->get_tensor_meta(i); - - const std::string name = ggml_get_name(tensor); - - read_data.resize(ggml_nbytes(tensor)); - tensor->data = read_data.data(); - model_loader->load_data_for(tensor); - - LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, model_loader->n_tensors, - ggml_get_name(tensor), - llama_format_tensor_shape(tensor).c_str(), - ggml_type_name(tensor->type)); - - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - - // quantize only 2D tensors - quantize &= (tensor->n_dims == 2); - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= quantized_type != tensor->type; - - enum ggml_type new_type; - void * new_data; - size_t new_size; - - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { - new_type = quantized_type; -#ifdef GGML_USE_K_QUANTS - if (name == TN_OUTPUT) { - int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K == 0 && ny % QK_K == 0) { - new_type = GGML_TYPE_Q6_K; - } - } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; - ++i_attention_wv; - } else if (name.find("feed_forward.w2.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; - ++i_feed_forward_w2; - } else if (name.find("attn_output.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - } - bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { - int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K != 0 || ny % QK_K != 0) { - LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); - convert_incompatible_tensor = true; - } - } - if (convert_incompatible_tensor) { - if (name == TN_OUTPUT) { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == TN_TOKEN_EMBD) { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. - LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); - } - } -#endif - - const size_t nelements = ggml_nelements(tensor); - - float * f32_data; - std::vector f32_conv_buf; - - if (tensor->type == GGML_TYPE_F32) { - f32_data = (float *) tensor->data; - } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); - } else { - llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); - f32_data = (float *) f32_conv_buf.data(); - } - - LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); - fflush(stdout); - - work.resize(nelements * 4); // upper bound on size - new_data = work.data(); - std::vector hist_cur(1 << 4, 0); - - const int chunk_size = 32 * 512; - const int nchunk = (nelements + chunk_size - 1)/chunk_size; - const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; - if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); - } else { - size_t counter = 0; - new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () { - std::vector local_hist; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - size_t first = counter; counter += chunk_size; - if (first >= nelements) { - if (!local_hist.empty()) { - for (int j=0; j %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); - int64_t tot_count = 0; - for (size_t i = 0; i < hist_cur.size(); i++) { - hist_all[i] += hist_cur[i]; - tot_count += hist_cur[i]; - } - - if (tot_count > 0) { - for (size_t i = 0; i < hist_cur.size(); i++) { - LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); - } - } - LLAMA_LOG_INFO("\n"); - } - total_size_org += ggml_nbytes(tensor); - total_size_new += new_size; - - // update the gguf meta data as we go - gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); - - // write tensor data + padding - fout.write((const char *) new_data, new_size); - zeros(fout, GGML_PAD(new_size, align) - new_size); - } - - // go back to beginning of file and write the updated meta data - { - fout.seekp(0); - std::vector data(gguf_get_meta_size(ctx_out)); - gguf_get_meta_data(ctx_out, data.data()); - fout.write((const char *) data.data(), data.size()); - } - - fout.close(); - - gguf_free(ctx_out); - - LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - - // print histogram for all tensors - { - int64_t sum_all = 0; - for (size_t i = 0; i < hist_all.size(); i++) { - sum_all += hist_all[i]; - } - - if (sum_all > 0) { - LLAMA_LOG_INFO("%s: hist: ", __func__); - for (size_t i = 0; i < hist_all.size(); i++) { - LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all)); - } - LLAMA_LOG_INFO("\n"); - } - } -} - -int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - - const int64_t t_start_lora_us = ggml_time_us(); - - auto fin = std::ifstream(path_lora, std::ios::binary); - if (!fin) { - LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora); - return 1; - } - - // verify magic and version - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); - - if (format_version != 1) { - LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); - return 1; - } - } - - int32_t lora_r; - int32_t lora_alpha; - fin.read((char *) &lora_r, sizeof(lora_r)); - fin.read((char *) &lora_alpha, sizeof(lora_alpha)); - float scaling = (float)lora_alpha / (float)lora_r; - - LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - - // create a temporary ggml context to store the lora tensors - // todo: calculate size from biggest possible tensor - std::vector lora_buf(1024ull * 1024ull * 1024ull); - struct ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map lora_tensors; - - // create a name -> tensor map of the model to accelerate lookups - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - } - - // load base model - std::unique_ptr model_loader; - ggml_context * base_ctx = NULL; - std::vector base_buf; - if (path_base_model) { - LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); - - size_t ctx_size; - size_t mmapped_size; - model_loader->calc_sizes(ctx_size, mmapped_size); - base_buf.resize(ctx_size); - - ggml_init_params base_params; - base_params.mem_size = base_buf.size(); - base_params.mem_buffer = base_buf.data(); - base_params.no_alloc = model_loader->use_mmap; - - base_ctx = ggml_init(base_params); - - // maybe this should in llama_model_loader - if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa())); - } - } - - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - if (fin.eof()) { - break; - } - - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - } - - std::string name; - { - char buf[1024]; - fin.read(buf, length); - name = std::string(buf, length); - } - - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; - } - - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); - return 1; - } - - // create ggml tensor - ggml_type wtype; - switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; - default: - { - LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", - __func__, ftype); - return false; - } - } - ggml_tensor * lora_tensor; - if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); - } - else { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - ggml_set_name(lora_tensor, "lora_tensor"); - - // load tensor data - size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); - offset = (offset + 31) & -32; - fin.seekg(offset); - fin.read((char*)lora_tensor->data, tensor_data_size); - - lora_tensors[name] = lora_tensor; - - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { - - ggml_tensor * dest_t = model_tensors[base_name]; - - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); - } - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; - } -#endif // GGML_USE_CUBLAS - - ggml_tensor * base_t; - if (model_loader) { - struct gguf_context * ctx_gguf = model_loader->ctx_gguf; - - // load from base model - if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } - - // TODO: not tested!! maybe not working! - base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); - model_loader->load_data_for(base_t); - } else { - base_t = dest_t; - } - - if (ggml_is_quantized(base_t->type)) { - if (!warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } - } - - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); - - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); - - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - return 1; - } - - // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - offload_func(BA); - ggml_set_name(BA, "BA"); - - if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - ggml_set_name(scale_tensor, "scale_tensor"); - - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); - offload_func(BA); - ggml_set_name(BA, "BA_scaled"); - } - - ggml_tensor * r; - if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); - offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); - } - else { - r = ggml_add(lora_ctx, base_t, BA); - offload_func(r); - ggml_set_name(r, "r_add"); - - r = ggml_cpy(lora_ctx, r, dest_t); - offload_func(r); - ggml_set_name(r, "r_cpy"); - } - - struct ggml_cgraph gf = ggml_build_forward(r); - - ggml_graph_compute_helper(work_buffer, &gf, n_threads); - - // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); - lora_tensors.clear(); - - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); - } - } - } - - // TODO: this should be in a destructor, it will leak on failure - ggml_free(lora_ctx); - if (base_ctx) { - ggml_free(base_ctx); - } - - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); - - return 0; -} - -// -// interface implementation -// - -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, - /*.gpu_layers =*/ 0, - /*.main_gpu =*/ 0, - /*.tensor_split =*/ nullptr, - /*.rope_freq_base =*/ 10000.0f, - /*.rope_freq_scale =*/ 1.0f, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, - /*.low_vram =*/ false, - /*.mul_mat_q =*/ false, - /*.f16_kv =*/ true, - /*.logits_all =*/ false, - /*.vocab_only =*/ false, - /*.use_mmap =*/ true, - /*.use_mlock =*/ false, - /*.embedding =*/ false, - }; - - return result; -} - -struct llama_model_quantize_params llama_model_quantize_default_params() { - struct llama_model_quantize_params result = { - /*.nthread =*/ 0, - /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, - /*.allow_requantize =*/ false, - /*.quantize_output_tensor =*/ true, - }; - - return result; -} - -int llama_max_devices(void) { - return LLAMA_MAX_DEVICES; -} - -bool llama_mmap_supported(void) { - return llama_mmap::SUPPORTED; -} - -bool llama_mlock_supported(void) { - return llama_mlock::SUPPORTED; -} - -void llama_backend_init(bool numa) { - ggml_time_init(); - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - if (numa) { - ggml_numa_init(); - } - -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif -} - -void llama_backend_free(void) { -#ifdef GGML_USE_MPI - ggml_mpi_backend_free(); -#endif -} - -int64_t llama_time_us(void) { - return ggml_time_us(); -} - -struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_context_params params) { - ggml_time_init(); - - llama_model * model = new llama_model; - - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, - params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); - delete model; - return nullptr; - } - - return model; -} - -void llama_free_model(struct llama_model * model) { - delete model; -} - -struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { - - if (!model) { - return nullptr; - } - - llama_context * ctx = new llama_context(*model); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - unsigned cur_percentage = 0; - if (params.progress_callback == NULL) { - params.progress_callback_user_data = &cur_percentage; - params.progress_callback = [](float progress, void * ctx) { - unsigned * cur_percentage_p = (unsigned *) ctx; - unsigned percentage = (unsigned) (100 * progress); - while (percentage > *cur_percentage_p) { - *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); - if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); - } - } - }; - } - - ctx->rng = std::mt19937(params.seed); - ctx->logits_all = params.logits_all; - - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - - // reserve memory for context buffers - if (!params.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); - return nullptr; - } - - { - const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); - } - - const auto & hparams = ctx->model.hparams; - - // resized during inference - if (params.logits_all) { - ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); - } else { - ctx->logits.reserve(hparams.n_vocab); - } - - if (params.embedding){ - ctx->embedding.resize(hparams.n_embd); - } - -#ifdef LLAMA_USE_ALLOCATOR - { - static const size_t tensor_alignment = 32; - // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); - - // create measure allocator - ctx->alloc = ggml_allocr_new_measure(tensor_alignment); - - // build worst-case graph - int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); - int n_past = hparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); - - // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - - LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); - - // debug - for comparison with scratch buffer - //size_t prev_req = - // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + - // MEM_REQ_SCRATCH1().at(ctx->model.type) + - // MEM_REQ_EVAL().at(ctx->model.type); - //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); - - // recreate allocator with exact memory requirements - ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); - } -#else - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); -#endif - -#ifdef LLAMA_USE_SCRATCH - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); -#endif - } - -#ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(1); - - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; - } - - void * data_ptr = NULL; - size_t data_size = 0; - - if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].data, ctx->buf_scratch[0].size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].data, ctx->buf_scratch[1].size, 0)); -#undef LLAMA_METAL_CHECK_BUF - } -#endif - -#ifdef GGML_USE_MPI - ctx->ctx_mpi = ggml_mpi_init(); - - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); - while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_backend_free(); - exit(1); - } -#endif - - return ctx; -} - -struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params) { - - struct llama_model * model = llama_load_model_from_file(path_model, params); - if (!model) { - return nullptr; - } - struct llama_context * ctx = llama_new_context_with_model(model, params); - ctx->model_owner = true; - return ctx; -} - -void llama_free(struct llama_context * ctx) { - delete ctx; -} - -int llama_model_quantize( - const char * fname_inp, - const char * fname_out, - const llama_model_quantize_params * params) { - try { - llama_model_quantize_internal(fname_inp, fname_out, params); - return 0; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); - return 1; - } -} - -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { - try { - return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; - } -} - -int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) { - try { - return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; - } -} - -int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->kv_self.n; -} - -#define LLAMA_MAX_RNG_STATE (64*1024) - -void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - seed = time(NULL); - } - ctx->rng.seed(seed); -} - -// Returns the *maximum* size of the state -size_t llama_get_state_size(const struct llama_context * ctx) { - // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. - // for reference, std::mt19937(1337) serializes to 6701 bytes. - const size_t s_rng_size = sizeof(size_t); - const size_t s_rng = LLAMA_MAX_RNG_STATE; - const size_t s_logits_capacity = sizeof(size_t); - const size_t s_logits_size = sizeof(size_t); - const size_t s_logits = ctx->logits.capacity() * sizeof(float); - const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embedding.size() * sizeof(float); - const size_t s_kv_size = sizeof(size_t); - const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->kv_self.buf.size; - - const size_t s_total = ( - + s_rng_size - + s_rng - + s_logits_capacity - + s_logits_size - + s_logits - + s_embedding_size - + s_embedding - + s_kv_size - + s_kv_ntok - + s_kv - ); - - return s_total; -} - -// llama_context_data -struct llama_data_context { - virtual void write(const void * src, size_t size) = 0; - virtual size_t get_size_written() = 0; - virtual ~llama_data_context() = default; -}; - -struct llama_data_buffer_context : llama_data_context { - uint8_t * ptr; - size_t size_written = 0; - - llama_data_buffer_context(uint8_t * p) : ptr(p) {} - - void write(const void * src, size_t size) override { - memcpy(ptr, src, size); - ptr += size; - size_written += size; - } - - size_t get_size_written() override { - return size_written; - } -}; - -struct llama_data_file_context : llama_data_context { - FILE * file; - size_t size_written = 0; - - llama_data_file_context(FILE * f) : file(f) {} - - void write(const void * src, size_t size) override { - fwrite(src, size, 1, file); - size_written += size; - } - - size_t get_size_written() override { - return size_written; - } -}; - -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_file_context data_ctx(&file); - * llama_copy_state_data(ctx, &data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_buffer_context data_ctx(&buf.data()); - * llama_copy_state_data(ctx, &data_ctx); - * -*/ -void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { - // copy rng - { - std::stringstream rng_ss; - rng_ss << ctx->rng; - - const size_t rng_size = rng_ss.str().size(); - char rng_buf[LLAMA_MAX_RNG_STATE]; - - memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); - memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); - - data_ctx->write(&rng_size, sizeof(rng_size)); - data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE); - } - - // copy logits - { - const size_t logits_cap = ctx->logits.capacity(); - const size_t logits_size = ctx->logits.size(); - - data_ctx->write(&logits_cap, sizeof(logits_cap)); - data_ctx->write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - data_ctx->write(ctx->logits.data(), logits_size * sizeof(float)); - } - - // If there is a gap between the size and the capacity, write padding - size_t padding_size = (logits_cap - logits_size) * sizeof(float); - if (padding_size > 0) { - std::vector padding(padding_size, 0); // Create a buffer filled with zeros - data_ctx->write(padding.data(), padding_size); - } - } - - // copy embeddings - { - const size_t embedding_size = ctx->embedding.size(); - - data_ctx->write(&embedding_size, sizeof(embedding_size)); - - if (embedding_size) { - data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float)); - } - } - - // copy kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = hparams.n_ctx; - - const size_t kv_size = kv_self.buf.size; - const int kv_ntok = llama_get_kv_cache_token_count(ctx); - - data_ctx->write(&kv_size, sizeof(kv_size)); - data_ctx->write(&kv_ntok, sizeof(kv_ntok)); - - if (kv_size) { - const size_t elt_size = ggml_element_size(kv_self.k); - - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; - - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - std::vector kout3d_data(ggml_nbytes(kout3d), 0); - kout3d->data = kout3d_data.data(); - - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - std::vector vout3d_data(ggml_nbytes(vout3d), 0); - vout3d->data = vout3d_data.data(); - - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); - - ggml_free(cpy_ctx); - - // our data is now in the kout3d_data and vout3d_data buffers - // write them to file - data_ctx->write(kout3d_data.data(), kout3d_data.size()); - data_ctx->write(vout3d_data.data(), vout3d_data.size()); - } - } -} - -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - llama_data_buffer_context data_ctx(dst); - llama_copy_state_data_internal(ctx, &data_ctx); - - return data_ctx.get_size_written(); -} - -// Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { - uint8_t * inp = src; - - // set rng - { - size_t rng_size; - char rng_buf[LLAMA_MAX_RNG_STATE]; - - memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE; - - std::stringstream rng_ss; - rng_ss.str(std::string(&rng_buf[0], rng_size)); - rng_ss >> ctx->rng; - - GGML_ASSERT(rng_ss.fail() == false); - } - - // set logits - { - size_t logits_cap; - size_t logits_size; - - memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); - memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - - GGML_ASSERT(ctx->logits.capacity() == logits_cap); - - if (logits_size) { - ctx->logits.resize(logits_size); - memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); - } - - inp += logits_cap * sizeof(float); - } - - // set embeddings - { - size_t embedding_size; - - memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); - - GGML_ASSERT(ctx->embedding.capacity() == embedding_size); - - if (embedding_size) { - memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); - inp += embedding_size * sizeof(float); - } - } - - // set kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = hparams.n_ctx; - - size_t kv_size; - int kv_ntok; - - memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); - - if (kv_size) { - GGML_ASSERT(kv_self.buf.size == kv_size); - - const size_t elt_size = ggml_element_size(kv_self.k); - - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; - - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - kin3d->data = (void *) inp; - inp += ggml_nbytes(kin3d); - - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - vin3d->data = (void *) inp; - inp += ggml_nbytes(vin3d); - - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); - - ggml_free(cpy_ctx); - } - - ctx->kv_self.n = kv_ntok; - } - - const size_t nread = inp - src; - const size_t max_size = llama_get_state_size(ctx); - - GGML_ASSERT(nread <= max_size); - - return nread; -} - -static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(path_session, "rb"); - GGML_UNUSED(ctx); - GGML_UNUSED(path_session); - GGML_UNUSED(tokens_out); - GGML_UNUSED(n_token_capacity); - GGML_UNUSED(n_token_count_out); - - // TODO: implement with GGUF format - return true; -} - -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - try { - return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); - return false; - } -} - -bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - llama_file file(path_session, "wb"); - GGML_UNUSED(ctx); - GGML_UNUSED(tokens); - GGML_UNUSED(n_token_count); - - // TODO: implement with GGUF format - - return true; -} - -int llama_eval( - struct llama_context * ctx, - const llama_token * tokens, - int n_tokens, - int n_past, - int n_threads) { - if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } - - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; - } - - return 0; -} - -int llama_eval_embd( - struct llama_context * ctx, - const float * embd, - int n_tokens, - int n_past, - int n_threads) { - if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } - - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; - } - - return 0; -} - -int llama_eval_export(struct llama_context * ctx, const char * fname) { - const int n_batch = 1; - const int n_ctx = 512 - n_batch; - - const std::vector tmp(n_batch, llama_token_bos()); - - if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } - - return 0; -} - -int llama_tokenize_with_model( - const struct llama_model * model, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - auto escape = llama_vocab_type(model->vocab) == "spm"; - auto res = llama_tokenize(model->vocab, text, add_bos, escape); - - if (n_max_tokens < (int) res.size()) { - LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); - return -((int) res.size()); - } - - for (size_t i = 0; i < res.size(); i++) { - tokens[i] = res[i]; - } - - return res.size(); -} - -int llama_tokenize( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); -} - -std::vector llama_tokenize( - struct llama_context * ctx, - const std::string & text, - bool add_bos) { - int length = text.length() + add_bos; - std::vector result(length); - length = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); - if (length < 0) { - result.resize(-length); - int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); - assert(check == -length); - GGML_UNUSED(check); - } else { - result.resize(length); - } - return result; -} - -int llama_tokenize_bpe( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false); - - if (n_max_tokens < (int) res.size()) { - LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); - return -((int) res.size()); - } - - for (size_t i = 0; i < res.size(); i++) { - tokens[i] = res[i]; - } - - return res.size(); -} - -std::vector llama_tokenize_bpe( - struct llama_context * ctx, - const std::string & text, - bool add_bos) { - int length = text.length() + add_bos; - std::vector result(length); - length = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos); - if (length < 0) { - result.resize(-length); - int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos); - assert(check == -length); - GGML_UNUSED(check); - } else { - result.resize(length); - } - return result; -} - -int llama_n_vocab_from_model(const struct llama_model * model) { - return model->vocab.id_to_token.size(); -} - -int llama_n_ctx_from_model(const struct llama_model * model) { - return model->hparams.n_ctx; -} - -int llama_n_embd_from_model(const struct llama_model * model) { - return model->hparams.n_embd; -} - -int llama_n_vocab(const struct llama_context * ctx) { - return ctx->model.vocab.id_to_token.size(); -} - -int llama_n_ctx(const struct llama_context * ctx) { - return ctx->model.hparams.n_ctx; -} - -int llama_n_embd(const struct llama_context * ctx) { - return ctx->model.hparams.n_embd; -} - -int llama_get_vocab_from_model( - const struct llama_model * model, - const char * * strings, - float * scores, - int capacity) { - int n = std::min(capacity, (int) model->vocab.id_to_token.size()); - for (int i = 0; ivocab.id_to_token[i].tok.c_str(); - scores[i] = model->vocab.id_to_token[i].score; - } - return n; -} - -int llama_get_vocab( - const struct llama_context * ctx, - const char * * strings, - float * scores, - int capacity) { - return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); -} - -float * llama_get_logits(struct llama_context * ctx) { - return ctx->logits.data(); -} - -float * llama_get_embeddings(struct llama_context * ctx) { - return ctx->embedding.data(); -} - -int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) { - if (0 <= token && token < llama_n_vocab_from_model(model)) { - if (llama_is_normal_token(model->vocab, token)) { - std::string result = model->vocab.id_to_token[token].tok; - if(llama_vocab_type(model->vocab) == "spm") { - result = llama_unescape_whitespace(result); - } - if (length < (int) result.length()) { - return -result.length(); - } - strncpy(str, result.c_str(), result.length()); - return result.length(); - } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT - if (length < 3) { - return -3; - } - strncpy(str, "\xe2\x96\x85", 3); - return 3; - } else if (llama_is_control_token(model->vocab, token)) { - ; - } else if (llama_is_byte_token(model->vocab, token)) { - if (length < 1) { - return -1; - } - str[0] = llama_byte_to_char(model->vocab, token); - str[1] = 0x00; - return 1; - } - } - return 0; -} - -int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * str, int length) { - return llama_token_to_str_with_model(&ctx->model, token, str, length); -} - -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { - std::vector result(8, 0); - const int length = llama_token_to_str(ctx, token, result.data(), result.size()); - if (length < 0) { - result.resize(-length); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); - GGML_ASSERT(check == -length); - } else { - result.resize(length); - } - - return std::string(result.data(), result.size()); -} - -int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * str, int length) { - if (0 <= token && token < llama_n_vocab_from_model(&ctx->model)) { - std::string result = ctx->model.vocab.id_to_token[token].tok; - if (length < (int) result.length()) { - return -result.length(); - } - strncpy(str, result.c_str(), result.length()); - return result.length(); - } - return 0; -} - -std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) { - std::vector result(8, 0); - const int length = llama_token_to_str_bpe(ctx, token, result.data(), result.size()); - if (length < 0) { - result.resize(-length); - const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size()); - GGML_ASSERT(check == -length); - } else { - result.resize(length); - } - - return std::string(result.data(), result.size()); -} - -llama_token llama_token_bos(void) { - return 1; -} - -llama_token llama_token_eos(void) { - return 2; -} - -llama_token llama_token_nl(void) { - return 13; -} - -struct llama_timings llama_get_timings(struct llama_context * ctx) { - struct llama_timings result = { - /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, - /*.t_end_ms =*/ 1.00 * ggml_time_ms(), - /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, - /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, - /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, - /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, - - /*.n_sample =*/ std::max(1, ctx->n_sample), - /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), - /*.n_eval =*/ std::max(1, ctx->n_eval), - }; - - return result; -} - -void llama_print_timings(struct llama_context * ctx) { - const llama_timings timings = llama_get_timings(ctx); - - LLAMA_LOG_INFO("\n"); - LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); - LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); - LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); -} - -void llama_reset_timings(struct llama_context * ctx) { - ctx->t_start_us = ggml_time_us(); - ctx->t_sample_us = ctx->n_sample = 0; - ctx->t_eval_us = ctx->n_eval = 0; - ctx->t_p_eval_us = ctx->n_p_eval = 0; -} - -const char * llama_print_system_info(void) { - static std::string s; - - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - - return s.c_str(); -} - -// For internal test use -const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { - return ctx->model.tensors_by_name; -} - -void llama_log_set(llama_log_callback log_callback, void * user_data) { - g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; - g_state.log_callback_user_data = user_data; -} - -#if defined(_MSC_VER) && !defined(vsnprintf) -#define vsnprintf _vsnprintf -#endif - -static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { - va_list args_copy; - va_copy(args_copy, args); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - g_state.log_callback(level, buffer, g_state.log_callback_user_data); - } else { - char* buffer2 = new char[len+1]; - vsnprintf(buffer2, len+1, format, args_copy); - buffer2[len] = 0; - g_state.log_callback(level, buffer2, g_state.log_callback_user_data); - delete[] buffer2; - } - va_end(args_copy); -} - -static void llama_log_internal(llama_log_level level, const char * format, ...) { - va_list args; - va_start(args, format); - llama_log_internal_v(level, format, args); - va_end(args); -} - -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - fputs(text, stderr); - fflush(stderr); -} diff --git a/gguf-llama.h b/gguf-llama.h deleted file mode 100644 index d81a1b5de..000000000 --- a/gguf-llama.h +++ /dev/null @@ -1,505 +0,0 @@ -#ifndef LLAMA_H -#define LLAMA_H - -#include "ggml.h" -#ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" -#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES -#else -#define LLAMA_MAX_DEVICES 1 -#endif // GGML_USE_CUBLAS -#include -#include -#include - -#ifdef LLAMA_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef LLAMA_BUILD -# define LLAMA_API __declspec(dllexport) -# else -# define LLAMA_API __declspec(dllimport) -# endif -# else -# define LLAMA_API __attribute__ ((visibility ("default"))) -# endif -#else -# define LLAMA_API -#endif - -#ifdef __GNUC__ -# define DEPRECATED(func, hint) func __attribute__((deprecated(hint))) -#elif defined(_MSC_VER) -# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func -#else -# define DEPRECATED(func, hint) func -#endif - -#define LLAMA_DEFAULT_SEED 0xFFFFFFFF - -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) -// Defined when llama.cpp is compiled with support for offloading model layers to GPU. -#define LLAMA_SUPPORTS_GPU_OFFLOAD -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - // - // C interface - // - // TODO: show sample usage - // - - struct llama_model; - struct llama_context; - - typedef int llama_token; - - typedef struct llama_token_data { - llama_token id; // token id - float logit; // log-odds of the token - float p; // probability of the token - } llama_token_data; - - typedef struct llama_token_data_array { - llama_token_data * data; - size_t size; - bool sorted; - } llama_token_data_array; - - typedef void (*llama_progress_callback)(float progress, void *ctx); - - enum llama_log_level { - LLAMA_LOG_LEVEL_ERROR = 2, - LLAMA_LOG_LEVEL_WARN = 3, - LLAMA_LOG_LEVEL_INFO = 4 - }; - - // Signature for logging events - // Note that text includes the new line character at the end for most events. - // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it - // if it exists. - // It might not exist for progress report where '.' is output repeatedly. - typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); - - struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - int32_t n_ctx; // text context - int32_t n_batch; // prompt processing batch size - int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors - - const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - - // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor - - // called with a progress value between 0 and 1, pass NULL to disable - llama_progress_callback progress_callback; - // context pointer passed to the progress callback - void * progress_callback_user_data; - - // Keep the booleans together to avoid misalignment during copy-by-value. - bool low_vram; // if true, reduce VRAM usage at the cost of performance - bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_mlock; // force system to keep model in RAM - bool embedding; // embedding mode only - }; - - // model file types - enum llama_ftype { - LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors - }; - - // model quantization parameters - typedef struct llama_model_quantize_params { - int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - } llama_model_quantize_params; - - // grammar types - struct llama_grammar; - - // grammar element type - enum llama_gretype { - // end of rule definition - LLAMA_GRETYPE_END = 0, - - // start of alternate definition for rule - LLAMA_GRETYPE_ALT = 1, - - // non-terminal element: reference to rule - LLAMA_GRETYPE_RULE_REF = 2, - - // terminal element: character (code point) - LLAMA_GRETYPE_CHAR = 3, - - // inverse char(s) ([^a], [^a-b] [^abc]) - LLAMA_GRETYPE_CHAR_NOT = 4, - - // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to - // be an inclusive range ([a-z]) - LLAMA_GRETYPE_CHAR_RNG_UPPER = 5, - - // modifies a preceding LLAMA_GRETYPE_CHAR or - // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) - LLAMA_GRETYPE_CHAR_ALT = 6, - }; - - typedef struct llama_grammar_element { - enum llama_gretype type; - uint32_t value; // Unicode code point or rule ID - } llama_grammar_element; - - // performance timing information - struct llama_timings { - double t_start_ms; - double t_end_ms; - double t_load_ms; - double t_sample_ms; - double t_p_eval_ms; - double t_eval_ms; - - int32_t n_sample; - int32_t n_p_eval; - int32_t n_eval; - }; - - LLAMA_API struct llama_context_params llama_context_default_params(void); - LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); - - LLAMA_API int llama_max_devices(void); - LLAMA_API bool llama_mmap_supported(void); - LLAMA_API bool llama_mlock_supported(void); - - // TODO: not great API - very likely to change - // Initialize the llama + ggml backend - // If numa is true, use NUMA optimizations - // Call once at the start of the program - LLAMA_API void llama_backend_init(bool numa); - // Call once at the end of the program - currently only used for MPI - LLAMA_API void llama_backend_free(void); - - LLAMA_API int64_t llama_time_us(void); - - LLAMA_API struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_context_params params); - - LLAMA_API void llama_free_model(struct llama_model * model); - - LLAMA_API struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params); - - - // Frees all allocated memory - LLAMA_API void llama_free(struct llama_context * ctx); - - // Returns 0 on success - LLAMA_API int llama_model_quantize( - const char * fname_inp, - const char * fname_out, - const llama_model_quantize_params * params); - - // Apply a LoRA adapter to a loaded model - // path_base_model is the path to a higher quality model to use as a base for - // the layers modified by the adapter. Can be NULL to use the current loaded model. - // The model needs to be reloaded before applying a new adapter, otherwise the adapter - // will be applied on top of the previous one - // Returns 0 on success - LLAMA_API DEPRECATED(int llama_apply_lora_from_file( - struct llama_context * ctx, - const char * path_lora, - const char * path_base_model, - int n_threads), - "please use llama_model_apply_lora_from_file instead"); - - LLAMA_API int llama_model_apply_lora_from_file( - const struct llama_model * model, - const char * path_lora, - const char * path_base_model, - int n_threads); - - // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); - - // Sets the current rng seed. - LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); - - // Returns the maximum size in bytes of the state (rng, logits, embedding - // and kv_cache) - will often be smaller after compacting tokens - LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); - - // Copies the state to the specified destination address. - // Destination needs to have allocated enough memory. - // Returns the number of bytes copied - LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); - - // Set the state reading from the specified address - // Returns the number of bytes read - LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); - - // Save/load session file - LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); - - // Run the llama inference to obtain the logits and probabilities for the next token. - // tokens + n_tokens is the provided batch of new tokens to process - // n_past is the number of tokens to use from previous eval calls - // Returns 0 on success - LLAMA_API int llama_eval( - struct llama_context * ctx, - const llama_token * tokens, - int n_tokens, - int n_past, - int n_threads); - - // Same as llama_eval, but use float matrix input directly. - LLAMA_API int llama_eval_embd( - struct llama_context * ctx, - const float * embd, - int n_tokens, - int n_past, - int n_threads); - - // Export a static computation graph for context of 511 and batch size of 1 - // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these - // parameters here to keep things simple - // IMPORTANT: do not use for anything else other than debugging and testing! - LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); - - // Convert the provided text into tokens. - // The tokens pointer must be large enough to hold the resulting tokens. - // Returns the number of tokens on success, no more than n_max_tokens - // Returns a negative number on failure - the number of tokens that would have been returned - // TODO: not sure if correct - LLAMA_API int llama_tokenize( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos); - - LLAMA_API int llama_tokenize_bpe( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos); - - LLAMA_API int llama_tokenize_with_model( - const struct llama_model * model, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos); - - LLAMA_API int llama_n_vocab(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); - LLAMA_API int llama_n_embd (const struct llama_context * ctx); - - LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); - LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); - LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); - - // Get the vocabulary as output parameters. - // Returns number of results. - LLAMA_API int llama_get_vocab( - const struct llama_context * ctx, - const char * * strings, - float * scores, - int capacity); - - LLAMA_API int llama_get_vocab_from_model( - const struct llama_model * model, - const char * * strings, - float * scores, - int capacity); - - // Token logits obtained from the last call to llama_eval() - // The logits for the last token are stored in the last row - // Can be mutated in order to change the probabilities of the next token - // Rows: n_tokens - // Cols: n_vocab - LLAMA_API float * llama_get_logits(struct llama_context * ctx); - - // Get the embeddings for the input - // shape: [n_embd] (1-dimensional) - LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); - - // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API int llama_token_to_str( - const struct llama_context * ctx, - llama_token token, - char * str, - int length); - - LLAMA_API int llama_token_to_str_bpe( - const struct llama_context * ctx, - llama_token token, - char * str, - int length); - - LLAMA_API int llama_token_to_str_with_model( - const struct llama_model * model, - llama_token token, - char * str, - int length); - // Special tokens - LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence - LLAMA_API llama_token llama_token_eos(void); // end-of-sentence - LLAMA_API llama_token llama_token_nl(void); // next-line - - // Grammar - // - LLAMA_API struct llama_grammar * llama_grammar_init( - const llama_grammar_element ** rules, - size_t n_rules, - size_t start_rule_index); - - LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); - - // Sampling functions - - /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); - - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); - - /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. - /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. - LLAMA_API void llama_sample_classifier_free_guidance( - struct llama_context * ctx, - llama_token_data_array * candidates, - struct llama_context * guidance_ctx, - float scale); - - /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. - LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); - - /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); - - /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); - - /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); - - /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); - LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); - - /// @details Apply constraints from grammar - LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar); - - /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); - - /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); - - /// @details Selects the token with the highest probability. - LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); - - /// @details Randomly selects a token from the candidates based on their probabilities. - LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); - - /// @details Accepts the sampled token into the grammar - LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token); - - // Performance information - LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); - LLAMA_API void llama_print_timings(struct llama_context * ctx); - LLAMA_API void llama_reset_timings(struct llama_context * ctx); - - // Print system information - LLAMA_API const char * llama_print_system_info(void); - - // Set callback for all future logging events. - // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); - -#ifdef __cplusplus -} -#endif - -// C++ API, will be moving to common.h soon (TM) -#ifdef LLAMA_API_CPP - -#include -#include - -// -// Vocab utils -// - -std::vector llama_tokenize( - struct llama_context * ctx, - const std::string & text, - bool add_bos); - -std::vector llama_tokenize_bpe( - struct llama_context * ctx, - const std::string & text, - bool add_bos); - -std::string llama_token_to_str( - const struct llama_context * ctx, - llama_token token); - -std::string llama_token_to_str_bpe( - const struct llama_context * ctx, - llama_token token); - -// Internal API to be implemented by llama.cpp and used by tests/benchmarks only -#ifdef LLAMA_API_INTERNAL - -struct ggml_tensor; - -const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); - -#endif // LLAMA_API_CPP - -#endif // LLAMA_API_INTERNAL - -#endif // LLAMA_H diff --git a/llama-util.h b/llama-util.h deleted file mode 100644 index 75e19c50c..000000000 --- a/llama-util.h +++ /dev/null @@ -1,553 +0,0 @@ -// Internal header to be included only by llama.cpp. -// Contains wrappers around OS interfaces. - -#ifndef LLAMA_UTIL_H -#define LLAMA_UTIL_H - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #endif - #if defined(_POSIX_MEMLOCK_RANGE) - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include - #include // for _fseeki64 -#endif - -#define LLAMA_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -#ifdef __GNUC__ -#ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) -#else -__attribute__((format(printf, 1, 2))) -#endif -#endif -static std::string format(const char * fmt, ...) { - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - LLAMA_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - LLAMA_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - LLAMA_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - LLAMA_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, len, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -// llama_context_data -struct llama_data_context { - virtual void write(const void * src, size_t size) = 0; - virtual size_t get_size_written() = 0; - virtual ~llama_data_context() = default; -}; - -struct llama_data_buffer_context : llama_data_context { - uint8_t* ptr; - size_t size_written = 0; - - llama_data_buffer_context(uint8_t * p) : ptr(p) {} - - void write(const void * src, size_t size) override { - memcpy(ptr, src, size); - ptr += size; - size_written += size; - } - - size_t get_size_written() override { - return size_written; - } -}; - -struct llama_data_file_context : llama_data_context { - llama_file* file; - size_t size_written = 0; - - llama_data_file_context(llama_file * f) : file(f) {} - - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; - } - - size_t get_size_written() override { - return size_written; - } -}; - -#if defined(_WIN32) -static std::string llama_format_win_err(DWORD err) { - LPSTR buf; - size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); - if (!size) { - return "FormatMessageA failed"; - } - std::string ret(buf, size); - LocalFree(buf); - return ret; -} -#endif - -struct llama_mmap { - void * addr; - size_t size; - - llama_mmap(const llama_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch >= file->size) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error(format("mmap failed: %s", strerror(errno))); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~llama_mmap() { - munmap(addr, size); - } -#elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) numa; - - size = file->size; - - HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); - - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - DWORD error = GetLastError(); - - if (hMapping == NULL) { - throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); - } - - addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - error = GetLastError(); - CloseHandle(hMapping); - - if (addr == NULL) { - throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); - } - - if (prefetch) { - // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we - // will dynamically load it using GetProcAddress. - BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); - HMODULE hKernel32; - - // This call is guaranteed to succeed. - hKernel32 = GetModuleHandleW(L"kernel32.dll"); - - // This call may fail if on a pre-Win8 system. - pPrefetchVirtualMemory = reinterpret_cast (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); - - if (pPrefetchVirtualMemory) { - // Advise the kernel to preload the mapped memory. - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } - } - } - - ~llama_mmap() { - if (!UnmapViewOfFile(addr)) { - fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } -#else - static constexpr bool SUPPORTED = false; - - llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -// Represents some region of memory being locked using mlock or VirtualLock; -// will automatically unlock on destruction. -struct llama_mlock { - void * addr = NULL; - size_t size = 0; - bool failed_already = false; - - llama_mlock() {} - llama_mlock(const llama_mlock &) = delete; - - ~llama_mlock() { - if (size) { - raw_unlock(addr, size); - } - } - - void init(void * ptr) { - LLAMA_ASSERT(addr == NULL && size == 0); - addr = ptr; - } - - void grow_to(size_t target_size) { - LLAMA_ASSERT(addr); - if (failed_already) { - return; - } - size_t granularity = lock_granularity(); - target_size = (target_size + granularity - 1) & ~(granularity - 1); - if (target_size > size) { - if (raw_lock((uint8_t *) addr + size, target_size - size)) { - size = target_size; - } else { - failed_already = true; - } - } - } - -#ifdef _POSIX_MEMLOCK_RANGE - static constexpr bool SUPPORTED = true; - - size_t lock_granularity() { - return (size_t) sysconf(_SC_PAGESIZE); - } - - #ifdef __APPLE__ - #define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" - #else - #define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" - #endif - - bool raw_lock(const void * addr, size_t size) { - if (!mlock(addr, size)) { - return true; - } else { - char* errmsg = std::strerror(errno); - bool suggest = (errno == ENOMEM); - - // Check if the resource limit is fine after all - struct rlimit lock_limit; - if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) - suggest = false; - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) - suggest = false; - - fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); - return false; - } - } - - #undef MLOCK_SUGGESTION - - void raw_unlock(void * addr, size_t size) { - if (munlock(addr, size)) { - fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); - } - } -#elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - size_t lock_granularity() { - SYSTEM_INFO si; - GetSystemInfo(&si); - return (size_t) si.dwPageSize; - } - - bool raw_lock(void * ptr, size_t len) { - for (int tries = 1; ; tries++) { - if (VirtualLock(ptr, len)) { - return true; - } - if (tries == 2) { - fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", - len, size, llama_format_win_err(GetLastError()).c_str()); - return false; - } - - // It failed but this was only the first try; increase the working - // set size and try again. - SIZE_T min_ws_size, max_ws_size; - if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { - fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - return false; - } - // Per MSDN: "The maximum number of pages that a process can lock - // is equal to the number of pages in its minimum working set minus - // a small overhead." - // Hopefully a megabyte is enough overhead: - size_t increment = len + 1048576; - // The minimum must be <= the maximum, so we need to increase both: - min_ws_size += increment; - max_ws_size += increment; - if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { - fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - return false; - } - } - } - - void raw_unlock(void * ptr, size_t len) { - if (!VirtualUnlock(ptr, len)) { - fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } -#else - static constexpr bool SUPPORTED = false; - - size_t lock_granularity() { - return (size_t) 65536; - } - - bool raw_lock(const void * addr, size_t len) { - fprintf(stderr, "warning: mlock not supported on this system\n"); - return false; - } - - void raw_unlock(const void * addr, size_t len) {} -#endif -}; - -// Replacement for std::vector that doesn't require zero-initialization. -struct llama_buffer { - uint8_t * addr = NULL; - size_t size = 0; - - llama_buffer() = default; - - void resize(size_t len) { -#ifdef GGML_USE_METAL - free(addr); - int result = posix_memalign((void **) &addr, getpagesize(), len); - if (result == 0) { - memset(addr, 0, len); - } - else { - addr = NULL; - } -#else - delete[] addr; - addr = new uint8_t[len]; -#endif - size = len; - } - - ~llama_buffer() { -#ifdef GGML_USE_METAL - free(addr); -#else - delete[] addr; -#endif - addr = NULL; - } - - // disable copy and move - llama_buffer(const llama_buffer&) = delete; - llama_buffer(llama_buffer&&) = delete; - llama_buffer& operator=(const llama_buffer&) = delete; - llama_buffer& operator=(llama_buffer&&) = delete; -}; - -#ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" -struct llama_ctx_buffer { - uint8_t * addr = NULL; - bool is_cuda; - size_t size = 0; - - llama_ctx_buffer() = default; - - void resize(size_t size) { - free(); - - addr = (uint8_t *) ggml_cuda_host_malloc(size); - if (addr) { - is_cuda = true; - } - else { - // fall back to pageable memory - addr = new uint8_t[size]; - is_cuda = false; - } - this->size = size; - } - - void free() { - if (addr) { - if (is_cuda) { - ggml_cuda_host_free(addr); - } - else { - delete[] addr; - } - } - addr = NULL; - } - - ~llama_ctx_buffer() { - free(); - } - - // disable copy and move - llama_ctx_buffer(const llama_ctx_buffer&) = delete; - llama_ctx_buffer(llama_ctx_buffer&&) = delete; - llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete; - llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete; -}; -#else -typedef llama_buffer llama_ctx_buffer; -#endif - -#endif diff --git a/llama.cpp b/llama.cpp index fb2207f84..38a2d5ba8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6,96 +6,154 @@ #include #endif -#include "llama-util.h" #define LLAMA_API_CPP // TODO: eliminate me #include "llama.h" #include "ggml.h" + +#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) +# include "ggml-alloc.h" +# define LLAMA_USE_ALLOCATOR +#else +# define LLAMA_USE_SCRATCH +# define LLAMA_MAX_SCRATCH_BUFFERS 16 +#endif + #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +# include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +# include "ggml-opencl.h" #endif #ifdef GGML_USE_METAL -#include "ggml-metal.h" +# include "ggml-metal.h" #endif #ifdef GGML_USE_MPI -#include "ggml-mpi.h" +# include "ggml-mpi.h" #endif #ifdef GGML_USE_K_QUANTS -#ifndef QK_K -#ifdef GGML_QKK_64 -#define QK_K 64 -#else -#define QK_K 256 -#endif -#endif +# ifndef QK_K +# ifdef GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 +# endif +# endif +#endif + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #include + #include // for _fseeki64 #endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include +#include +#include #include -#include #include +#include +#include +#include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -static void llama_log_internal(llama_log_level level, const char* format, ...); +// tensor names +#define TN_TOKEN_EMBD "token_embd.weight" +#define TN_OUTPUT_NORM "output_norm.weight" +#define TN_OUTPUT "output.weight" +#define TN_ATTN_NORM "blk.%d.attn_norm.weight" +#define TN_ATTN_Q "blk.%d.attn_q.weight" +#define TN_ATTN_K "blk.%d.attn_k.weight" +#define TN_ATTN_V "blk.%d.attn_v.weight" +#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight" +#define TN_FFN_NORM "blk.%d.ffn_norm.weight" +#define TN_FFN_GATE "blk.%d.ffn_gate.weight" +#define TN_FFN_DOWN "blk.%d.ffn_down.weight" +#define TN_FFN_UP "blk.%d.ffn_up.weight" + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) +#endif + +// +// logging +// +LLAMA_ATTRIBUTE_FORMAT(2, 3) +static void llama_log_internal (llama_log_level level, const char* format, ...); static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); + #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +// +// helpers +// -#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) -#include "ggml-alloc.h" -#define LLAMA_USE_ALLOCATOR -#else -#define LLAMA_USE_SCRATCH -#define LLAMA_MAX_SCRATCH_BUFFERS 16 -#endif +template +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} -#define UNUSED GGML_UNUSED +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} -// available llama models -enum e_model { - MODEL_UNKNOWN, - MODEL_3B, - MODEL_7B, - MODEL_13B, - MODEL_30B, - MODEL_65B, - MODEL_70B, -}; - -static const size_t kB = 1024; -static const size_t MB = 1024*1024; - -// computed for n_ctx == 2048 -// TODO: dynamically determine these sizes -// needs modifications in ggml - -typedef void (*offload_func_t)(struct ggml_tensor * tensor); - -void llama_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; +LLAMA_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); } // @@ -113,10 +171,439 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * ggml_graph_compute(graph, &plan); } +// +// llama helpers +// + +#ifdef GGML_USE_CUBLAS +# define llama_host_malloc(n) ggml_cuda_host_malloc(n) +# define llama_host_free(data) ggml_cuda_host_free(data) +#elif GGML_USE_METAL +# define llama_host_malloc(n) ggml_metal_host_malloc(n) +# define llama_host_free(data) ggml_metal_host_free(data) +#else +# define llama_host_malloc(n) malloc(n) +# define llama_host_free(data) free(data) +#endif + +#if defined(_WIN32) +static std::string llama_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} +#endif + +struct llama_buffer { + void * data = NULL; + size_t size = 0; + + // fallback to malloc / free + // useful in cases where CUDA can try to allocate PINNED memory + bool fallback = false; + + void resize(size_t n) { + llama_host_free(data); + + data = llama_host_malloc(n); + if (!data) { + fallback = true; + data = malloc(n); + } else { + fallback = false; + } + + GGML_ASSERT(data); + size = n; + } + + ~llama_buffer() { + if (data) { + if (fallback) { // NOLINT + free(data); + } else { + llama_host_free(data); + } + } + + data = NULL; + } +}; + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) const { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + uint32_t read_u32() { + uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~llama_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) numa; + + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + + if (hMapping == NULL) { + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); + } + + #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 + if (prefetch) { + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + #else + #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") + #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 + } + + ~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) file; + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + + bool failed_already = false; + + llama_mlock() {} + llama_mlock(const llama_mlock &) = delete; + + ~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * ptr) { + GGML_ASSERT(addr == NULL && size == 0); // NOLINT + addr = ptr; + } + + void grow_to(size_t target_size) { + GGML_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + static size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) const { + if (!mlock(addr, size)) { + return true; + } + + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); + + // Check if the resource limit is fine after all + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { + suggest = false; + } + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + suggest = false; + } + + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + return false; + } + + #undef MLOCK_SUGGESTION + + static void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + static size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * ptr, size_t len) const { + for (int tries = 1; ; tries++) { + if (VirtualLock(ptr, len)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + len, size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." + // Hopefully a megabyte is enough overhead: + size_t increment = len + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += increment; + max_ws_size += increment; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + static void raw_unlock(void * ptr, size_t len) { + if (!VirtualUnlock(ptr, len)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + static size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const void * addr, size_t len) const { + fprintf(stderr, "warning: mlock not supported on this system\n"); + return false; + } + + static void raw_unlock(const void * addr, size_t len) {} +#endif +}; + +typedef void (*offload_func_t)(struct ggml_tensor * tensor); + +void llama_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} + +// +// globals +// + +struct llama_state { + // We save the log callback globally + llama_log_callback log_callback = llama_log_callback_default; + void * log_callback_user_data = nullptr; +}; + +static llama_state g_state; + // // memory sizes (calculated for n_batch == 512) // +// computed for n_ctx == 2048 +// TODO: dynamically determine these sizes +// needs modifications in ggml + +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_3B, + MODEL_7B, + MODEL_13B, + MODEL_30B, + MODEL_65B, + MODEL_70B, +}; + +static const size_t kB = 1024; +static const size_t MB = 1024*1024; + static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { static std::map k_sizes = { @@ -190,18 +677,15 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() // default hparams (LLaMA 7B) struct llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_ctx = 512; uint32_t n_embd = 4096; - uint32_t n_mult = 256; uint32_t n_head = 32; uint32_t n_head_kv = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + uint32_t n_ff = 11008; - // LLaMAv2 - // TODO: load from model data hparams - float f_ffn_mult = 1.0f; - float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; + float f_norm_rms_eps = 1e-5; float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; @@ -259,7 +743,7 @@ struct llama_kv_cache { struct ggml_context * ctx = NULL; - llama_ctx_buffer buf; + llama_buffer buf; int n; // number of tokens currently in the cache @@ -276,6 +760,11 @@ struct llama_kv_cache { }; struct llama_vocab { + // TODO: + // - add a vector of merges + // - add members for bos/eos/pad/sep tokens + // so that we can pass it to different types of tokenizers with a common interface + using id = int32_t; using token = std::string; @@ -292,6 +781,7 @@ struct llama_model { e_model type = MODEL_UNKNOWN; llama_hparams hparams; + llama_vocab vocab; struct ggml_tensor * tok_embeddings; @@ -305,7 +795,7 @@ struct llama_model { struct ggml_context * ctx = NULL; // the model memory buffer - llama_ctx_buffer buf; + llama_buffer buf; // model memory mapped file std::unique_ptr mapping; @@ -320,8 +810,6 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; - llama_vocab vocab; - ~llama_model() { if (ctx) { ggml_free(ctx); @@ -394,15 +882,16 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state - llama_ctx_buffer buf_compute; + llama_buffer buf_compute; #ifdef LLAMA_USE_ALLOCATOR - llama_ctx_buffer buf_alloc; + llama_buffer buf_alloc; ggml_allocr * alloc = NULL; #endif #ifdef LLAMA_USE_SCRATCH - llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; #endif @@ -415,7 +904,7 @@ struct llama_context { ggml_mpi_context * ctx_mpi = NULL; #endif - void use_buf(struct ggml_context * ctx, int i) { + void use_buf(struct ggml_context * ctx, int i) { // NOLINT #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -423,7 +912,7 @@ struct llama_context { last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); + last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.data, }); } if (buf_last >= 0) { @@ -437,7 +926,7 @@ struct llama_context { #endif } - size_t get_buf_max_mem(int i) const { + size_t get_buf_max_mem(int i) { // NOLINT #if defined(LLAMA_USE_SCRATCH) return buf_max_size[i]; #else @@ -447,419 +936,11 @@ struct llama_context { } }; -struct llama_state { - // We save the log callback globally - llama_log_callback log_callback = llama_log_callback_default; - void * log_callback_user_data = nullptr; -}; -// global state -static llama_state g_state; - -template -static T checked_mul(T a, T b) { - T ret = a * b; - if (a != 0 && ret / a != b) { - throw std::runtime_error(format("overflow multiplying %llu * %llu", - (unsigned long long) a, (unsigned long long) b)); - } - return ret; -} - -static size_t checked_div(size_t a, size_t b) { - if (b == 0 || a % b != 0) { - throw std::runtime_error(format("error dividing %zu / %zu", a, b)); - } - return a / b; -} - -static std::string llama_format_tensor_shape(const std::vector & ne) { - char buf[256]; - snprintf(buf, sizeof(buf), "%5u", ne.at(0)); - for (size_t i = 1; i < ne.size(); i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i)); - } - return buf; -} - -static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { - size_t size = ggml_type_size(type); - for (uint32_t dim : ne) { - size = checked_mul(size, dim); - } - return size / ggml_blck_size(type); -} - -struct llama_load_tensor { - std::string name; - enum ggml_type type = GGML_TYPE_F32; - std::vector ne; - size_t file_off; - size_t size; - struct ggml_tensor * ggml_tensor = NULL; - uint8_t * data; -}; - -struct llama_load_tensors_map { - // tensors is kept in a separate vector to preserve file order - std::vector tensors; - std::unordered_map name_to_idx; -}; - -enum llama_file_version { - LLAMA_FILE_VERSION_GGML, - LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab - LLAMA_FILE_VERSION_GGJT_V1, // added padding - LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format - LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format -}; - -struct llama_file_loader { - llama_file file; - llama_file_version file_version; - llama_hparams hparams; - llama_vocab vocab; - - llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map) - : file(fname, "rb") { - LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname); - read_magic(); - read_hparams(); - read_vocab(); - read_tensor_metadata(tensors_map); - } - void read_magic() { - uint32_t magic = file.read_u32(); - - if (magic == LLAMA_FILE_MAGIC_GGML) { - file_version = LLAMA_FILE_VERSION_GGML; - return; - } - - uint32_t version = file.read_u32(); - - switch (magic) { - case LLAMA_FILE_MAGIC_GGMF: - switch (version) { - case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return; - } - break; - case LLAMA_FILE_MAGIC_GGJT: - switch (version) { - case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return; - case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return; - case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return; - } - } - - throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", - magic, version)); - } - void read_hparams() { - hparams.n_vocab = file.read_u32(); - hparams.n_embd = file.read_u32(); - hparams.n_mult = file.read_u32(); - hparams.n_head = file.read_u32(); - hparams.n_layer = file.read_u32(); - hparams.n_rot = file.read_u32(); - hparams.ftype = (enum llama_ftype) file.read_u32(); - - // LLaMAv2 - // TODO: read from header - hparams.n_head_kv = hparams.n_head; - } - void read_vocab() { - vocab.id_to_token.resize(hparams.n_vocab); - - for (uint32_t i = 0; i < hparams.n_vocab; i++) { - uint32_t len = file.read_u32(); - std::string word = file.read_string(len); - - float score = 0.0f; - file.read_raw(&score, sizeof(score)); - - GGML_ASSERT(vocab.token_to_id.find(word) == vocab.token_to_id.end()); - vocab.token_to_id[word] = i; - - auto & tok_score = vocab.id_to_token[i]; - tok_score.tok = std::move(word); - tok_score.score = score; - } - } - void read_tensor_metadata(llama_load_tensors_map & tensors_map) { - while (file.tell() < file.size) { - llama_load_tensor tensor; - uint32_t n_dims = file.read_u32(); - uint32_t name_len = file.read_u32(); - tensor.type = (enum ggml_type) file.read_u32(); - tensor.ne.resize(n_dims); - file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims); - std::string name = file.read_string(name_len); - if (n_dims < 1 || n_dims > 2) { - throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); - } - switch (tensor.type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - break; - default: { - throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type)); - } - } - - // skip to the next multiple of 32 bytes - if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { - file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); - } - - tensor.file_off = file.tell(); - tensor.name = name; - tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type); - file.seek(tensor.size, SEEK_CUR); - - tensors_map.tensors.push_back(tensor); - tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1; - } - } -}; - -struct llama_file_saver { - llama_file file; - llama_file_loader * any_file_loader; - llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) - : file(fname, "wb"), any_file_loader(any_file_loader) { - LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname); - write_magic(); - write_hparams(new_ftype); - write_vocab(); - } - void write_magic() { - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - } - void write_hparams(enum llama_ftype new_ftype) { - const llama_hparams & hparams = any_file_loader->hparams; - file.write_u32(hparams.n_vocab); - file.write_u32(hparams.n_embd); - file.write_u32(hparams.n_mult); - file.write_u32(hparams.n_head); - file.write_u32(hparams.n_layer); - file.write_u32(hparams.n_rot); - file.write_u32(new_ftype); - } - void write_vocab() { - if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { - LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); - } - uint32_t n_vocab = any_file_loader->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_score = any_file_loader->vocab.id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - } - void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { - switch (new_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - break; - default: LLAMA_ASSERT(false); - } - file.write_u32((uint32_t) tensor.ne.size()); - file.write_u32((uint32_t) tensor.name.size()); - file.write_u32(new_type); - file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); - file.write_raw(tensor.name.data(), tensor.name.size()); - file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); - LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); - file.write_raw(new_data, new_size); - } -}; - -struct llama_model_loader { - std::unique_ptr file_loader; - llama_load_tensors_map tensors_map; - bool use_mmap; - size_t num_ggml_tensors_created = 0; - struct ggml_context * ggml_ctx = NULL; - std::unique_ptr mapping; - - llama_model_loader(const std::string & fname_base, bool use_mmap) { - file_loader = std::unique_ptr(new llama_file_loader(fname_base.c_str(), tensors_map)); - if (!llama_mmap::SUPPORTED) { - use_mmap = false; - } - this->use_mmap = use_mmap; - } - - void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { - *ctx_size_p = *mmapped_size_p = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { - *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16; - } - } - - struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) { - auto it = tensors_map.name_to_idx.find(name); - if (it == tensors_map.name_to_idx.end()) { - throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str()))); - } - llama_load_tensor & lt = tensors_map.tensors.at(it->second); - if (lt.ne != ne) { - throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", - name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str())); - } - - return get_tensor_for(lt, backend); - } - - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) { - struct ggml_tensor * tensor; - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ggml_ctx, true); - } - if (lt.ne.size() == 2) { - tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); - } else { - LLAMA_ASSERT(lt.ne.size() == 1); - tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); - } - ggml_set_name(tensor, lt.name.c_str()); - LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor - - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ggml_ctx, use_mmap); - } - tensor->backend = backend; - lt.ggml_tensor = tensor; - num_ggml_tensors_created++; - return tensor; - } - - void done_getting_tensors() const { - if (num_ggml_tensors_created != tensors_map.tensors.size()) { - throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected")); - } - } - - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { - size_t data_size = 0; - size_t prefetch_size = file_loader->file.size; - size_t lock_size = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { - data_size += lt.size; - if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { - prefetch_size -= lt.size; - } - } - - if (use_mmap) { - mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa())); - if (lmlock) { - lmlock->init(mapping->addr); - } - } - - size_t done_size = 0; - for (llama_load_tensor & lt : tensors_map.tensors) { - if (progress_callback) { - progress_callback((float) done_size / data_size, progress_callback_user_data); - } - LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already - lt.data = (uint8_t *) lt.ggml_tensor->data; - - // allocate temp buffer if not using mmap - if (!use_mmap && lt.data == NULL) { - GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU); - lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor)); - } - - load_data_for(lt); - - switch(lt.ggml_tensor->backend) { - case GGML_BACKEND_CPU: - lt.ggml_tensor->data = lt.data; - if (use_mmap && lmlock) { - lock_size += lt.size; - lmlock->grow_to(lock_size); - } - break; -#if defined(GGML_USE_CUBLAS) - case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: - ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); - if (!use_mmap) { - free(lt.data); - } - break; -#elif defined(GGML_USE_CLBLAST) - case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(lt.data, lt.ggml_tensor); - if (!use_mmap) { - free(lt.data); - } - break; -#endif - default: - continue; - } - - done_size += lt.size; - } - } - - void load_data_for(llama_load_tensor & lt) { - if (use_mmap) { - lt.data = (uint8_t *) mapping->addr + lt.file_off; - } else { - llama_file & file = file_loader->file; - file.seek(lt.file_off, SEEK_SET); - file.read_raw(lt.data, lt.size); - } - - if (0) { - print_checksum(lt); - } - } - - static void print_checksum(llama_load_tensor & lt) { - uint32_t sum = 0; - for (size_t i = 0; i < lt.size; i++) { - uint8_t byte = lt.data[i]; - sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash - } - LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, - llama_format_tensor_shape(lt.ne).c_str(), lt.size); - } - -}; - // -// kv cache +// kv cache helpers // -static bool kv_cache_init( +static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, ggml_type wtype, @@ -876,7 +957,7 @@ static bool kv_cache_init( struct ggml_init_params params; params.mem_size = cache.buf.size; - params.mem_buffer = cache.buf.addr; + params.mem_buffer = cache.buf.data; params.no_alloc = false; cache.ctx = ggml_init(params); @@ -904,102 +985,262 @@ static bool kv_cache_init( return true; } -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, - /*.n_gqa =*/ 1, - /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS, - /*.gpu_layers =*/ 0, - /*.main_gpu =*/ 0, - /*.tensor_split =*/ nullptr, - /*.rope_freq_base =*/ 10000.0f, - /*.rope_freq_scale =*/ 1.0f, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, - /*.low_vram =*/ false, - /*.mul_mat_q =*/ false, - /*.f16_kv =*/ true, - /*.logits_all =*/ false, - /*.vocab_only =*/ false, - /*.use_mmap =*/ true, - /*.use_mlock =*/ false, - /*.embedding =*/ false, - }; - - return result; -} - -struct llama_model_quantize_params llama_model_quantize_default_params() { - struct llama_model_quantize_params result = { - /*.nthread =*/ 0, - /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, - /*.allow_requantize =*/ false, - /*.quantize_output_tensor =*/ true, - }; - - return result; -} - -int llama_max_devices() { - return LLAMA_MAX_DEVICES; -} - -bool llama_mmap_supported() { - return llama_mmap::SUPPORTED; -} - -bool llama_mlock_supported() { - return llama_mlock::SUPPORTED; -} - -void llama_backend_init(bool numa) { - ggml_time_init(); - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - if (numa) { - ggml_numa_init(); - } - -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif -} - -void llama_backend_free() { -#ifdef GGML_USE_MPI - ggml_mpi_backend_free(); -#endif -} - -int64_t llama_time_us() { - return ggml_time_us(); -} - // -// model loading +// model loading and saving // -static const char *llama_file_version_name(llama_file_version version) { +enum llama_file_version { + GGUF_FILE_VERSION_V1 = 1, +}; + +static const char * llama_file_version_name(llama_file_version version) { switch (version) { - case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; - case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; - case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; - case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; + case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)"; } return "unknown"; } -static const char *llama_ftype_name(enum llama_ftype ftype) { +static std::string llama_format_tensor_shape(const std::vector & ne) { + char buf[256]; + snprintf(buf, sizeof(buf), "%5u", ne.at(0)); + for (size_t i = 1; i < ne.size(); i++) { + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5u", ne.at(i)); + } + return buf; +} + +static std::string llama_format_tensor_shape(const struct ggml_tensor * t) { + char buf[256]; + snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); + } + return buf; +} + +struct llama_model_loader { + int n_kv = 0; + int n_tensors = 0; + int n_created = 0; + + bool use_mmap = false; + + llama_file file; + llama_file_version file_version; + + std::unique_ptr mapping; + + struct gguf_context * ctx_gguf = NULL; + struct ggml_context * ctx_meta = NULL; + + llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + ctx_gguf = gguf_init_from_file(fname.c_str(), params); + + n_kv = gguf_get_n_kv(ctx_gguf); + n_tensors = gguf_get_n_tensors(ctx_gguf); + + file_version = (enum llama_file_version) gguf_get_version(ctx_gguf); + + // print meta data + // TODO: make optional + { + LLAMA_LOG_INFO("%s: loaded meta data with %d key-value paris and %d tensors from %s (version %s)\n", + __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(file_version)); + + for (int i = 0; i < n_tensors; i++) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name); + + LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); + } + + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx_gguf, i); + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type)); + } + } + + if (!llama_mmap::SUPPORTED) { + LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); + use_mmap = false; + } + + this->use_mmap = use_mmap; + } + + const char * get_tensor_name(int i) const { + return gguf_get_tensor_name(ctx_gguf, i); + } + + struct ggml_tensor * get_tensor_meta(int i) const { + return ggml_get_tensor(ctx_meta, get_tensor_name(i)); + } + + void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { + ctx_size_p = 0; + mmapped_size_p = 0; + + for (int i = 0; i < n_tensors; i++) { + struct ggml_tensor * meta = get_tensor_meta(i); + ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); + } + } + + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) { + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ctx, true); + } + + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); + tensor->backend = backend; // TODO: ggml_set_backend + ggml_set_name(tensor, ggml_get_name(meta)); + + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ctx, use_mmap); + } + + n_created++; + + return tensor; + } + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + + { + bool is_ok = true; + for (size_t i = 0; i < ne.size(); ++i) { + if (ne[i] != cur->ne[i]) { + is_ok = false; + break; + } + } + if (!is_ok) { + throw std::runtime_error( + format("%s: tensor '%s' has wrong shape; expected %s, got %s", + __func__, name.c_str(), + llama_format_tensor_shape(ne).c_str(), + llama_format_tensor_shape(cur).c_str())); + } + } + + return create_tensor_for(ctx, cur, backend); + } + + void done_getting_tensors() const { + if (n_created != n_tensors) { + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + } + } + + size_t file_offset(const char * name) const { + const int idx = gguf_find_tensor(ctx_gguf, name); + + if (idx < 0) { + throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); + } + + return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); + } + + void load_data_for(struct ggml_tensor * cur) const { + const size_t offs = file_offset(ggml_get_name(cur)); + + if (use_mmap) { + cur->data = (uint8_t *) mapping->addr + offs; + } else { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); + } + } + + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t size_data = 0; + size_t size_lock = 0; + size_t size_pref = 0; // prefetch + + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + if (cur->backend == GGML_BACKEND_CPU) { + size_pref += ggml_nbytes(cur); + } + } + + if (use_mmap) { + mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); + if (lmlock) { + lmlock->init(mapping->addr); + } + } + + size_t done_size = 0; + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + + if (progress_callback) { + progress_callback((float) done_size / size_data, progress_callback_user_data); + } + + // allocate temp buffer if not using mmap + if (!use_mmap && cur->data == NULL) { + GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); + cur->data = malloc(ggml_nbytes(cur)); + } + + load_data_for(cur); + + switch (cur->backend) { + case GGML_BACKEND_CPU: + if (use_mmap && lmlock) { + size_lock += ggml_nbytes(cur); + lmlock->grow_to(size_lock); + } + break; +#if defined(GGML_USE_CUBLAS) + case GGML_BACKEND_GPU: + case GGML_BACKEND_GPU_SPLIT: + // old code: + //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); + + // TODO: test if this works !! + ggml_cuda_transform_tensor(cur->data, cur); + if (!use_mmap) { + free(cur->data); + } + break; +#elif defined(GGML_USE_CLBLAST) + case GGML_BACKEND_GPU: + ggml_cl_transform_tensor(cur->data, cur); + if (!use_mmap) { + free(cur->data); + } + break; +#endif + default: + continue; + } + + done_size += ggml_nbytes(cur); + } + } +}; + +// +// load LLaMA models +// + +static const char * llama_ftype_name(enum llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; @@ -1010,8 +1251,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; @@ -1019,20 +1261,21 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; - default: return "unknown, may not work"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + + default: return "unknown, may not work"; } } -static const char *llama_model_type_name(e_model type) { +static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; + case MODEL_3B: return "3B"; + case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; case MODEL_30B: return "30B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; - default: LLAMA_ASSERT(false); + default: GGML_ASSERT(false); } } @@ -1042,8 +1285,6 @@ static void llama_model_load_internal( llama_vocab & vocab, int n_ctx, int n_batch, - int n_gqa, - float rms_norm_eps, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -1057,27 +1298,46 @@ static void llama_model_load_internal( bool vocab_only, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); - vocab = std::move(ml->file_loader->vocab); - - if (vocab_only) { - return; - } - - model.hparams = ml->file_loader->hparams; model.n_gpu_layers = n_gpu_layers; - llama_file_version file_version = ml->file_loader->file_version; auto & hparams = model.hparams; - // TODO: read from file - hparams.f_rms_norm_eps = rms_norm_eps; - + // read hparams { + struct gguf_context * ctx = ml->ctx_gguf; + +#define GGUF_GET(dst, func, type, req, key) \ + { \ + const int kid = gguf_find_key(ctx, key); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error(format("key %s has wrong type: %s", key, gguf_type_name(ktype))); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + throw std::runtime_error(format("key not found in model: %s", key)); \ + } \ + } + + GGUF_GET(hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens"); + GGUF_GET(hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length"); + GGUF_GET(hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length"); + GGUF_GET(hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length"); + GGUF_GET(hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count"); + GGUF_GET(hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count"); + GGUF_GET(hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count"); + GGUF_GET(hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon"); + + // n_head_kv is optional, default to n_head + hparams.n_head_kv = hparams.n_head; + GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv"); +#undef GGUF_GET + switch (hparams.n_layer) { case 26: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; @@ -1095,78 +1355,91 @@ static void llama_model_load_internal( hparams.n_ctx = n_ctx; // LLaMAv2 - // TODO: temporary until GGUF - LLAMA_ASSERT(hparams.n_head % n_gqa == 0); - hparams.n_head_kv = hparams.n_head / n_gqa; - if (model.type == e_model::MODEL_65B && n_gqa == 8) { - LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; - hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model + // TODO: probably not needed + { + const auto n_gqa = hparams.n_gqa(); + + if (model.type == e_model::MODEL_65B && n_gqa == 8) { + LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa); + model.type = e_model::MODEL_70B; + } } hparams.rope_freq_base = rope_freq_base; hparams.rope_freq_scale = rope_freq_scale; } - // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 - const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3; - const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw; - const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - //const uint32_t n_ff = 28672; + // read vocab + { + struct gguf_context * ctx = ml->ctx_gguf; + + vocab.id_to_token.resize(hparams.n_vocab); + + const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens"); + if (token_idx == -1) { + throw std::runtime_error("cannot find token list in GGUF file\n"); + } + + const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores"); + if (score_idx == -1) { + throw std::runtime_error("cannot find token scores list in GGUF file\n"); + } + + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = scores[i]; + } + } { - LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version)); + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml->file_version)); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); - LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult); LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); - LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff); + LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } - if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { - throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)")); - } - } - - if (file_version < LLAMA_FILE_VERSION_GGJT_V3) { - if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || - hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 || - hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { - throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)")); - } + if (vocab_only) { + return; } auto & ctx = model.ctx; size_t ctx_size; size_t mmapped_size; - ml->calc_sizes(&ctx_size, &mmapped_size); + + ml->calc_sizes(ctx_size, mmapped_size); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { model.buf.resize(ctx_size); if (use_mlock) { - model.mlock_buf.init (model.buf.addr); + model.mlock_buf.init (model.buf.data); model.mlock_buf.grow_to(model.buf.size); } struct ggml_init_params params = { /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.addr, + /*.mem_buffer =*/ model.buf.data, /*.no_alloc =*/ ml->use_mmap, }; @@ -1202,9 +1475,7 @@ static void llama_model_load_internal( const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - ml->ggml_ctx = ctx; - - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU); // "output" tensor { @@ -1225,8 +1496,8 @@ static void llama_model_load_internal( backend_output = GGML_BACKEND_CPU; } - model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); + model.norm = ml->create_tensor(ctx, TN_OUTPUT_NORM, {n_embd}, backend_norm); + model.output = ml->create_tensor(ctx, TN_OUTPUT, {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(model.norm); } @@ -1235,6 +1506,8 @@ static void llama_model_load_internal( } } + const uint32_t n_ff = hparams.n_ff; + const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); @@ -1243,21 +1516,18 @@ static void llama_model_load_internal( const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; + layer.attention_norm = ml->create_tensor(ctx, format(TN_ATTN_NORM, i), {n_embd}, backend); - std::string layers_i = "layers." + std::to_string(i); + layer.wq = ml->create_tensor(ctx, format(TN_ATTN_Q, i), {n_embd, n_embd}, backend_split); + layer.wk = ml->create_tensor(ctx, format(TN_ATTN_K, i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml->create_tensor(ctx, format(TN_ATTN_V, i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml->create_tensor(ctx, format(TN_ATTN_OUTPUT, i), {n_embd, n_embd}, backend_split); - layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); + layer.ffn_norm = ml->create_tensor(ctx, format(TN_FFN_NORM, i), {n_embd}, backend); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); - - layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); - - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); + layer.w1 = ml->create_tensor(ctx, format(TN_FFN_GATE, i), {n_embd, n_ff}, backend_split); + layer.w2 = ml->create_tensor(ctx, format(TN_FFN_DOWN, i), { n_ff, n_embd}, backend_split); + layer.w3 = ml->create_tensor(ctx, format(TN_FFN_UP, i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -1355,8 +1625,9 @@ static void llama_model_load_internal( } // populate `tensors_by_name` - for (llama_load_tensor & lt : ml->tensors_map.tensors) { - model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, ml->get_tensor_name(i)); + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } (void) tensor_split; @@ -1366,7 +1637,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); + ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); @@ -1385,8 +1656,6 @@ static bool llama_model_load( llama_vocab & vocab, int n_ctx, int n_batch, - int n_gqa, - float rms_norm_eps, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -1401,7 +1670,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; @@ -1418,7 +1687,7 @@ static struct ggml_cgraph * llama_build_graph( int n_tokens, int n_past) { - LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT const int N = n_tokens; @@ -1427,7 +1696,7 @@ static struct ggml_cgraph * llama_build_graph( const auto & kv_self = lctx.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); + GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; @@ -1437,21 +1706,20 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); - LLAMA_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float rms_norm_eps = hparams.f_rms_norm_eps; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_rms_eps = hparams.f_norm_rms_eps; const int n_gpu_layers = model.n_gpu_layers; auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; - struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, + /*.mem_buffer =*/ buf_compute.data, /*.no_alloc =*/ false, }; @@ -1549,7 +1817,7 @@ static struct ggml_cgraph * llama_build_graph( // norm { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); offload_func(cur); ggml_set_name(cur, "rms_norm_0"); @@ -1694,7 +1962,7 @@ static struct ggml_cgraph * llama_build_graph( { // norm { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); offload_func(cur); ggml_set_name(cur, "rms_norm_1"); @@ -1744,7 +2012,7 @@ static struct ggml_cgraph * llama_build_graph( // norm { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); offload_func_nr(cur); ggml_set_name(cur, "rms_norm_2"); @@ -1801,7 +2069,7 @@ static bool llama_eval_internal( int n_threads, const char * cgraph_fname) { - LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT const int64_t t_start_us = ggml_time_us(); @@ -1816,7 +2084,7 @@ static bool llama_eval_internal( const auto & kv_self = lctx.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); + GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -1840,8 +2108,8 @@ static bool llama_eval_internal( struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); - LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; @@ -1997,15 +2265,15 @@ static bool llama_is_eos_token(const llama_vocab & vocab, llama_token token) { } static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token token) { - UNUSED(vocab); - UNUSED(token); + GGML_UNUSED(vocab); + GGML_UNUSED(token); // TODO: improve? return false; } static bool llama_is_unused_token(const llama_vocab & vocab, llama_token token) { - UNUSED(vocab); - UNUSED(token); + GGML_UNUSED(vocab); + GGML_UNUSED(token); // TODO: improve? return false; } @@ -2024,7 +2292,7 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token token) { static uint8_t llama_byte_to_char(const llama_vocab & vocab, uint8_t byte) { if (llama_vocab_type(vocab) == "spm") { - return byte + 3; + return byte - 3; } if (llama_vocab_type(vocab) == "bpe") { @@ -2278,8 +2546,8 @@ std::vector decode_utf8(const char * src) { // returns true iff pos points to the end of one of the definitions of a rule static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) { switch (pos->type) { - case LLAMA_GRETYPE_END: return true; - case LLAMA_GRETYPE_ALT: return true; + case LLAMA_GRETYPE_END: return true; // NOLINT + case LLAMA_GRETYPE_ALT: return true; // NOLINT default: return false; } } @@ -2292,7 +2560,7 @@ static std::pair llama_grammar_match_char( bool found = false; bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR; - LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); + GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT do { if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { @@ -2360,7 +2628,7 @@ static void llama_grammar_advance_stack( // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those - LLAMA_ASSERT(false); + GGML_ASSERT(false); } } @@ -2427,7 +2695,7 @@ static std::vector llama_grammar_reject_candidates_for_ } } - auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; + const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; // update top of stack to next element, if any std::vector stack_after(stack.begin(), stack.end() - 1); @@ -2449,7 +2717,7 @@ static std::vector llama_grammar_reject_candidates( const std::vector> & rules, const std::vector> & stacks, const std::vector & candidates) { - LLAMA_ASSERT(!stacks.empty()); // REVIEW + GGML_ASSERT(!stacks.empty()); // REVIEW if (candidates.empty()) { return std::vector(); @@ -2660,7 +2928,6 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * } } - void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { // Reference implementation: // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr @@ -2828,9 +3095,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const auto rejects = - llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); - for (auto & reject : rejects) { + const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); + for (const auto & reject : rejects) { candidates->data[reject.index].logit = -INFINITY; } @@ -3021,7 +3287,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar return; } } - LLAMA_ASSERT(false); + GGML_ASSERT(false); } std::string str = llama_token_to_str(ctx, token); @@ -3030,7 +3296,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); } - LLAMA_ASSERT(!grammar->stacks.empty()); + GGML_ASSERT(!grammar->stacks.empty()); ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } @@ -3039,37 +3305,37 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar // quantization // -static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) { - if (output.size < nelements * sizeof(float)) { - output.resize(nelements * sizeof(float)); +static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector & output, const size_t nelements, const int nthread) { + if (output.size() < nelements) { + output.resize(nelements); } - float * f32_output = (float *) output.addr; + float * f32_output = (float *) output.data(); ggml_type_traits_t qtype; - if (ggml_is_quantized(tensor.type)) { - qtype = ggml_internal_get_type_traits(tensor.type); + if (ggml_is_quantized(tensor->type)) { + qtype = ggml_internal_get_type_traits(tensor->type); if (qtype.to_float == NULL) { - throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); } - } else if (tensor.type != GGML_TYPE_F16) { - throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type))); + } else if (tensor->type != GGML_TYPE_F16) { + throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); } if (nthread < 2) { - if (tensor.type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements); - } else if (ggml_is_quantized(tensor.type)) { - qtype.to_float(tensor.data, f32_output, nelements); + if (tensor->type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); + } else if (ggml_is_quantized(tensor->type)) { + qtype.to_float(tensor->data, f32_output, nelements); } else { - LLAMA_ASSERT(false); // unreachable + GGML_ASSERT(false); // unreachable } return; } - auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type); - auto block_size_bytes = ggml_type_size(tensor.type); + auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type); + auto block_size_bytes = ggml_type_size(tensor->type); - LLAMA_ASSERT(nelements % block_size == 0); + GGML_ASSERT(nelements % block_size == 0); auto nblocks = nelements / block_size; auto blocks_per_thread = nblocks / nthread; auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count @@ -3087,14 +3353,13 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam qtype.to_float(inbuf, outbuf, nels); } }; - workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); in_buff_offs += thr_block_bytes; out_buff_offs += thr_elems; } for (auto & worker : workers) { worker.join(); } - } static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { @@ -3131,16 +3396,27 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype); + + const size_t align = GGUF_DEFAULT_ALIGNMENT; + struct gguf_context * ctx_out = gguf_init_empty(); + + // copy the KV pairs from the input file + gguf_set_kv (ctx_out, model_loader->ctx_gguf); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); #ifdef GGML_USE_K_QUANTS int n_attention_wv = 0; int n_feed_forward_w2 = 0; - for (auto& tensor : model_loader->tensors_map.tensors) { - if (tensor.name.find("attention.wv.weight") != std::string::npos) { + + for (int i = 0; i < model_loader->n_tensors; ++i) { + struct ggml_tensor * meta = model_loader->get_tensor_meta(i); + + const std::string name = ggml_get_name(meta); + + if (name.find("attn_v.weight") != std::string::npos) { ++n_attention_wv; } - else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { + else if (name.find("ffn_down.weight") != std::string::npos) { ++n_feed_forward_w2; } } @@ -3160,46 +3436,68 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; }; - size_t idx = 0; - for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { - llama_buffer read_data; - read_data.resize(tensor.size); - tensor.data = read_data.addr; + int idx = 0; + + std::vector read_data; + std::vector work; + + // populate the original tensors so we get an initial meta data + for (int i = 0; i < model_loader->n_tensors; ++i) { + struct ggml_tensor * meta = model_loader->get_tensor_meta(i); + gguf_add_tensor(ctx_out, meta); + } + + std::ofstream fout(fname_out, std::ios::binary); + + const size_t meta_size = gguf_get_meta_size(ctx_out); + + LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size); + + // placeholder for the meta data + ::zeros(fout, meta_size); + + for (int i = 0; i < model_loader->n_tensors; ++i) { + struct ggml_tensor * tensor = model_loader->get_tensor_meta(i); + + const std::string name = ggml_get_name(tensor); + + read_data.resize(ggml_nbytes(tensor)); + tensor->data = read_data.data(); model_loader->load_data_for(tensor); - LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ", - ++idx, model_loader->tensors_map.tensors.size(), - tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), - ggml_type_name(tensor.type)); + LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", + ++idx, model_loader->n_tensors, + ggml_get_name(tensor), + llama_format_tensor_shape(tensor).c_str(), + ggml_type_name(tensor->type)); // This used to be a regex, but has an extreme cost to compile times. - bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? // quantize only 2D tensors - quantize &= (tensor.ne.size() == 2); - quantize &= params->quantize_output_tensor || tensor.name != "output.weight"; - quantize &= quantized_type != tensor.type; + quantize &= (tensor->n_dims == 2); + quantize &= params->quantize_output_tensor || name != "output.weight"; + quantize &= quantized_type != tensor->type; enum ggml_type new_type; void * new_data; size_t new_size; - llama_buffer work; if (!quantize) { - new_type = tensor.type; - new_data = tensor.data; - new_size = tensor.size; - LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS - if (tensor.name == "output.weight") { - int nx = tensor.ne.at(0); - int ny = tensor.ne.at(1); + if (name == TN_OUTPUT) { + int nx = tensor->ne[0]; + int ny = tensor->ne[1]; if (nx % QK_K == 0 && ny % QK_K == 0) { new_type = GGML_TYPE_Q6_K; } - } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { + } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && @@ -3207,32 +3505,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; - } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { + } else if (name.find("feed_forward.w2.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; ++i_feed_forward_w2; - } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { + } else if (name.find("attn_output.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { - int nx = tensor.ne.at(0); - int ny = tensor.ne.at(1); + int nx = tensor->ne[0]; + int ny = tensor->ne[1]; if (nx % QK_K != 0 || ny % QK_K != 0) { LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); convert_incompatible_tensor = true; } } if (convert_incompatible_tensor) { - if (tensor.name == "output.weight") { + if (name == TN_OUTPUT) { new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (tensor.name == "tok_embeddings.weight") { + } else if (name == TN_TOKEN_EMBD) { new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); } else { @@ -3241,27 +3539,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } #endif - float * f32_data; - size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); - llama_buffer f32_conv_buf; + const size_t nelements = ggml_nelements(tensor); - if (tensor.type == GGML_TYPE_F32) { - f32_data = (float *) tensor.data; - } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type))); + float * f32_data; + std::vector f32_conv_buf; + + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); } else { llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); - f32_data = (float *) f32_conv_buf.addr; + f32_data = (float *) f32_conv_buf.data(); } LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); fflush(stdout); work.resize(nelements * 4); // upper bound on size - new_data = work.addr; + new_data = work.data(); std::vector hist_cur(1 << 4, 0); - int chunk_size = 32 * 512; + const int chunk_size = 32 * 512; const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; if (nthread_use < 2) { @@ -3269,7 +3568,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { size_t counter = 0; new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () { + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () { std::vector local_hist; size_t local_size = 0; while (true) { @@ -3304,7 +3603,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } - LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -3318,14 +3617,34 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } LLAMA_LOG_INFO("\n"); } - total_size_org += tensor.size; + total_size_org += ggml_nbytes(tensor); total_size_new += new_size; - file_saver.write_tensor(tensor, new_type, new_data, new_size); + + // update the gguf meta data as we go + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + + // write tensor data + padding + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); } + // go back to beginning of file and write the updated meta data + { + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *) data.data(), data.size()); + } + + fout.close(); + + gguf_free(ctx_out); + LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + // print histogram for all tensors { int64_t sum_all = 0; for (size_t i = 0; i < hist_all.size(); i++) { @@ -3342,230 +3661,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } - - -// -// interface implementation -// - -struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_context_params params) { - ggml_time_init(); - - llama_model * model = new llama_model; - - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, - memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); - delete model; - return nullptr; - } - - return model; -} - -void llama_free_model(struct llama_model * model) { - delete model; -} - -struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { - - if (!model) { - return nullptr; - } - - llama_context * ctx = new llama_context(*model); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - unsigned cur_percentage = 0; - if (params.progress_callback == NULL) { - params.progress_callback_user_data = &cur_percentage; - params.progress_callback = [](float progress, void * ctx) { - unsigned * cur_percentage_p = (unsigned *) ctx; - unsigned percentage = (unsigned) (100 * progress); - while (percentage > *cur_percentage_p) { - *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); - if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); - } - } - }; - } - - ctx->rng = std::mt19937(params.seed); - ctx->logits_all = params.logits_all; - - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - - // reserve memory for context buffers - if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { - LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); - return nullptr; - } - - { - const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); - } - - const auto & hparams = ctx->model.hparams; - - // resized during inference - if (params.logits_all) { - ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); - } else { - ctx->logits.reserve(hparams.n_vocab); - } - - if (params.embedding){ - ctx->embedding.resize(hparams.n_embd); - } - -#ifdef LLAMA_USE_ALLOCATOR - { - static const size_t tensor_alignment = 32; - // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); - - // create measure allocator - ctx->alloc = ggml_allocr_new_measure(tensor_alignment); - - // build worst-case graph - int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); - int n_past = hparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); - - // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - - LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); - - // debug - for comparison with scratch buffer - //size_t prev_req = - // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + - // MEM_REQ_SCRATCH1().at(ctx->model.type) + - // MEM_REQ_EVAL().at(ctx->model.type); - //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); - - // recreate allocator with exact memory requirements - ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); - } -#else - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); -#endif - -#ifdef LLAMA_USE_SCRATCH - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); -#endif - } - -#ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(1); - - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; - } - - void * data_ptr = NULL; - size_t data_size = 0; - - if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); -#undef LLAMA_METAL_CHECK_BUF - } -#endif - -#ifdef GGML_USE_MPI - ctx->ctx_mpi = ggml_mpi_init(); - - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); - while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_backend_free(); - exit(1); - } -#endif - - return ctx; -} - -struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params) { - - struct llama_model * model = llama_load_model_from_file(path_model, params); - if (!model) { - return nullptr; - } - struct llama_context * ctx = llama_new_context_with_model(model, params); - ctx->model_owner = true; - return ctx; -} - -void llama_free(struct llama_context * ctx) { - delete ctx; -} - -int llama_model_quantize( - const char * fname_inp, - const char * fname_out, - const llama_model_quantize_params *params) { - try { - llama_model_quantize_internal(fname_inp, fname_out, params); - return 0; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); - return 1; - } -} - int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -3581,10 +3676,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); - return 1; - } uint32_t format_version; fin.read((char *) &format_version, sizeof(format_version)); @@ -3602,7 +3693,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - // create a temporary ggml context to store the lora tensors // todo: calculate size from biggest possible tensor std::vector lora_buf(1024ull * 1024ull * 1024ull); @@ -3616,36 +3706,33 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // create a name -> tensor map of the model to accelerate lookups std::unordered_map model_tensors; - for (const auto & kv: model.tensors_by_name) { + for (const auto & kv : model.tensors_by_name) { model_tensors.insert(kv); } - // load base model std::unique_ptr model_loader; ggml_context * base_ctx = NULL; - llama_buffer base_buf; + std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); size_t ctx_size; size_t mmapped_size; - model_loader->calc_sizes(&ctx_size, &mmapped_size); + model_loader->calc_sizes(ctx_size, mmapped_size); base_buf.resize(ctx_size); ggml_init_params base_params; - base_params.mem_size = base_buf.size; - base_params.mem_buffer = base_buf.addr; + base_params.mem_size = base_buf.size(); + base_params.mem_buffer = base_buf.data(); base_params.no_alloc = model_loader->use_mmap; base_ctx = ggml_init(base_params); - model_loader->ggml_ctx = base_ctx; - // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa())); + model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa())); } } @@ -3750,19 +3837,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_tensor * base_t; if (model_loader) { + struct gguf_context * ctx_gguf = model_loader->ctx_gguf; + // load from base model - if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } - size_t idx = model_loader->tensors_map.name_to_idx[base_name]; - llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; - base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); - lt.data = (uint8_t *) lt.ggml_tensor->data; - model_loader->load_data_for(lt); - lt.ggml_tensor->data = lt.data; - } - else { + + // TODO: not tested!! maybe not working! + base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + model_loader->load_data_for(base_t); + } else { base_t = dest_t; } @@ -3846,6 +3932,305 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const return 0; } +// +// interface implementation +// + +struct llama_context_params llama_context_default_params() { + struct llama_context_params result = { + /*.seed =*/ LLAMA_DEFAULT_SEED, + /*.n_ctx =*/ 512, + /*.n_batch =*/ 512, + /*.gpu_layers =*/ 0, + /*.main_gpu =*/ 0, + /*.tensor_split =*/ nullptr, + /*.rope_freq_base =*/ 10000.0f, + /*.rope_freq_scale =*/ 1.0f, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, + /*.low_vram =*/ false, + /*.mul_mat_q =*/ false, + /*.f16_kv =*/ true, + /*.logits_all =*/ false, + /*.vocab_only =*/ false, + /*.use_mmap =*/ true, + /*.use_mlock =*/ false, + /*.embedding =*/ false, + }; + + return result; +} + +struct llama_model_quantize_params llama_model_quantize_default_params() { + struct llama_model_quantize_params result = { + /*.nthread =*/ 0, + /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.allow_requantize =*/ false, + /*.quantize_output_tensor =*/ true, + }; + + return result; +} + +int llama_max_devices(void) { + return LLAMA_MAX_DEVICES; +} + +bool llama_mmap_supported(void) { + return llama_mmap::SUPPORTED; +} + +bool llama_mlock_supported(void) { + return llama_mlock::SUPPORTED; +} + +void llama_backend_init(bool numa) { + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + if (numa) { + ggml_numa_init(); + } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif +} + +void llama_backend_free(void) { +#ifdef GGML_USE_MPI + ggml_mpi_backend_free(); +#endif +} + +int64_t llama_time_us(void) { + return ggml_time_us(); +} + +struct llama_model * llama_load_model_from_file( + const char * path_model, + struct llama_context_params params) { + ggml_time_init(); + + llama_model * model = new llama_model; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, + params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + delete model; + return nullptr; + } + + return model; +} + +void llama_free_model(struct llama_model * model) { + delete model; +} + +struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params) { + + if (!model) { + return nullptr; + } + + llama_context * ctx = new llama_context(*model); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_LOG_INFO("."); + if (percentage >= 100) { + LLAMA_LOG_INFO("\n"); + } + } + }; + } + + ctx->rng = std::mt19937(params.seed); + ctx->logits_all = params.logits_all; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + // reserve memory for context buffers + if (!params.vocab_only) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + llama_free(ctx); + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); + LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + + const auto & hparams = ctx->model.hparams; + + // resized during inference + if (params.logits_all) { + ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + } else { + ctx->logits.reserve(hparams.n_vocab); + } + + if (params.embedding){ + ctx->embedding.resize(hparams.n_embd); + } + +#ifdef LLAMA_USE_ALLOCATOR + { + static const size_t tensor_alignment = 32; + // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + + // create measure allocator + ctx->alloc = ggml_allocr_new_measure(tensor_alignment); + + // build worst-case graph + int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); + int n_past = hparams.n_ctx - n_tokens; + llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + + // measure memory requirements for the graph + size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; + + LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + + // debug - for comparison with scratch buffer + //size_t prev_req = + // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + + // MEM_REQ_SCRATCH1().at(ctx->model.type) + + // MEM_REQ_EVAL().at(ctx->model.type); + //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + + // recreate allocator with exact memory requirements + ggml_allocr_free(ctx->alloc); + + ctx->buf_alloc.resize(alloc_size); + ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); + } +#else + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); +#endif + +#ifdef LLAMA_USE_SCRATCH + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); + ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); +#endif + } + +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + // this allocates all Metal resources and memory buffers + ctx->ctx_metal = ggml_metal_init(1); + + if (!ctx->ctx_metal) { + LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); + llama_free(ctx); + return NULL; + } + + void * data_ptr = NULL; + size_t data_size = 0; + + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + } + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ + } + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].data, ctx->buf_scratch[0].size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].data, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_METAL_CHECK_BUF + } +#endif + +#ifdef GGML_USE_MPI + ctx->ctx_mpi = ggml_mpi_init(); + + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); + while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + llama_backend_free(); + exit(1); + } +#endif + + return ctx; +} + +struct llama_context * llama_init_from_file( + const char * path_model, + struct llama_context_params params) { + + struct llama_model * model = llama_load_model_from_file(path_model, params); + if (!model) { + return nullptr; + } + struct llama_context * ctx = llama_new_context_with_model(model, params); + ctx->model_owner = true; + return ctx; +} + +void llama_free(struct llama_context * ctx) { + delete ctx; +} + +int llama_model_quantize( + const char * fname_inp, + const char * fname_out, + const llama_model_quantize_params * params) { + try { + llama_model_quantize_internal(fname_inp, fname_out, params); + return 0; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); + return 1; + } +} + int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); @@ -3908,6 +4293,46 @@ size_t llama_get_state_size(const struct llama_context * ctx) { return s_total; } +// llama_context_data +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t * ptr; + size_t size_written = 0; + + llama_data_buffer_context(uint8_t * p) : ptr(p) {} + + void write(const void * src, size_t size) override { + memcpy(ptr, src, size); + ptr += size; + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_file_context : llama_data_context { + llama_file * file; + size_t size_written = 0; + + llama_data_file_context(llama_file * f) : file(f) {} + + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + /** copy state data into either a buffer or file depending on the passed in context * * file context: @@ -4041,7 +4466,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> ctx->rng; - LLAMA_ASSERT(rng_ss.fail() == false); + GGML_ASSERT(rng_ss.fail() == false); } // set logits @@ -4052,7 +4477,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - LLAMA_ASSERT(ctx->logits.capacity() == logits_cap); + GGML_ASSERT(ctx->logits.capacity() == logits_cap); if (logits_size) { ctx->logits.resize(logits_size); @@ -4068,7 +4493,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); - LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size); + GGML_ASSERT(ctx->embedding.capacity() == embedding_size); if (embedding_size) { memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); @@ -4091,7 +4516,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); if (kv_size) { - LLAMA_ASSERT(kv_self.buf.size == kv_size); + GGML_ASSERT(kv_self.buf.size == kv_size); const size_t elt_size = ggml_element_size(kv_self.k); @@ -4127,7 +4552,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const size_t nread = inp - src; const size_t max_size = llama_get_state_size(ctx); - LLAMA_ASSERT(nread <= max_size); + GGML_ASSERT(nread <= max_size); return nread; } @@ -4235,7 +4660,6 @@ int llama_eval( return 0; } - int llama_eval_embd( struct llama_context * ctx, const float * embd, @@ -4410,23 +4834,26 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) { +// does not write null-terminator to str +int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { if (0 <= token && token < llama_n_vocab_from_model(model)) { if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].tok; - if(llama_vocab_type(model->vocab) == "spm") { + if (llama_vocab_type(model->vocab) == "spm") { result = llama_unescape_whitespace(result); } if (length < (int) result.length()) { return -result.length(); } - strncpy(str, result.c_str(), result.length()); + memcpy(buf, result.c_str(), result.length()); return result.length(); - } else if (llama_is_unknown_token(model->vocab, token)) { + } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT if (length < 3) { return -3; } - strncpy(str, "\xe2\x96\x85", 3); + buf[0] = '\xe2'; + buf[1] = '\x96'; + buf[2] = '\x85'; return 3; } else if (llama_is_control_token(model->vocab, token)) { ; @@ -4434,8 +4861,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token if (length < 1) { return -1; } - str[0] = llama_byte_to_char(model->vocab, token); - str[1] = 0x00; + buf[0] = llama_byte_to_char(model->vocab, token); return 1; } } @@ -4466,7 +4892,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, if (length < (int) result.length()) { return -result.length(); } - strncpy(str, result.c_str(), result.length()); + memcpy(str, result.c_str(), result.length()); return result.length(); } return 0; @@ -4486,15 +4912,15 @@ std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token return std::string(result.data(), result.size()); } -llama_token llama_token_bos() { +llama_token llama_token_bos(void) { return 1; } -llama_token llama_token_eos() { +llama_token llama_token_eos(void) { return 2; } -llama_token llama_token_nl() { +llama_token llama_token_nl(void) { return 13; } @@ -4563,7 +4989,6 @@ const std::vector>& llama_internal_ return ctx->model.tensors_by_name; } - void llama_log_set(llama_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; diff --git a/llama.h b/llama.h index 6ed1c4b91..2e407a1db 100644 --- a/llama.h +++ b/llama.h @@ -34,29 +34,18 @@ # define DEPRECATED(func, hint) func #endif -#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' -#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +#define LLAMA_DEFAULT_SEED 0xFFFFFFFF -#define LLAMA_FILE_VERSION 3 -#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT -#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML -#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 1 +#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' -#define LLAMA_DEFAULT_SEED 0xFFFFFFFF +#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN +#define LLAMA_SESSION_VERSION 1 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. #define LLAMA_SUPPORTS_GPU_OFFLOAD #endif -#ifndef LLAMA_DEFAULT_RMS_EPS -#define LLAMA_DEFAULT_RMS_EPS 5e-6f -#endif - #ifdef __cplusplus extern "C" { #endif @@ -103,8 +92,6 @@ extern "C" { uint32_t seed; // RNG seed, -1 for random int32_t n_ctx; // text context int32_t n_batch; // prompt processing batch size - int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams) - float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams) int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors @@ -129,6 +116,7 @@ extern "C" { bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only }; + // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, @@ -155,7 +143,7 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype + enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight } llama_model_quantize_params; @@ -208,17 +196,12 @@ extern "C" { int32_t n_eval; }; - // Set callback for all future logging events. - // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + LLAMA_API struct llama_context_params llama_context_default_params(void); + LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); - LLAMA_API int llama_max_devices(); - - LLAMA_API struct llama_context_params llama_context_default_params(); - LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); - - LLAMA_API bool llama_mmap_supported(); - LLAMA_API bool llama_mlock_supported(); + LLAMA_API int llama_max_devices(void); + LLAMA_API bool llama_mmap_supported(void); + LLAMA_API bool llama_mlock_supported(void); // TODO: not great API - very likely to change // Initialize the llama + ggml backend @@ -226,9 +209,9 @@ extern "C" { // Call once at the start of the program LLAMA_API void llama_backend_init(bool numa); // Call once at the end of the program - currently only used for MPI - LLAMA_API void llama_backend_free(); + LLAMA_API void llama_backend_free(void); - LLAMA_API int64_t llama_time_us(); + LLAMA_API int64_t llama_time_us(void); LLAMA_API struct llama_model * llama_load_model_from_file( const char * path_model, @@ -240,13 +223,6 @@ extern "C" { struct llama_model * model, struct llama_context_params params); - // Various functions for loading a ggml llama model. - // Allocate (almost) all memory needed for the model. - // Return NULL on failure - LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params), - "please use llama_load_model_from_file combined with llama_new_context_with_model instead"); // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); @@ -384,27 +360,28 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context + // Does not write null terminator to the buffer LLAMA_API int llama_token_to_str( const struct llama_context * ctx, llama_token token, - char * str, + char * buf, int length); LLAMA_API int llama_token_to_str_bpe( const struct llama_context * ctx, llama_token token, - char * str, + char * buf, int length); LLAMA_API int llama_token_to_str_with_model( const struct llama_model * model, llama_token token, - char * str, + char * buf, int length); // Special tokens - LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence - LLAMA_API llama_token llama_token_eos(); // end-of-sentence - LLAMA_API llama_token llama_token_nl(); // next-line + LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(void); // end-of-sentence + LLAMA_API llama_token llama_token_nl(void); // next-line // Grammar // @@ -484,6 +461,10 @@ extern "C" { // Print system information LLAMA_API const char * llama_print_system_info(void); + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. + LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + #ifdef __cplusplus } #endif