From 5765d7a587dc265fc9319e3a3f3551e2f9686f9f Mon Sep 17 00:00:00 2001
From: slaren
Date: Sat, 15 Jul 2023 12:44:47 +0200
Subject: [PATCH] restore simple.cpp for now

---
 examples/simple/simple.cpp | 229 ++++++++++++++-----------------------
 1 file changed, 87 insertions(+), 142 deletions(-)

diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index a4046302e..2d62ebc78 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,14 +1,46 @@
-#include <cstdio>
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
-#include "llama.h"
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <signal.h>
+#endif
 
-void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
-    // print the tokens from the prompt
-    for (llama_token id : prompt_tokens) {
-        printf("%s", llama_token_to_str(ctx, id));
+
+int main(int argc, char ** argv)
+{
+    gpt_params params;
+
+    //---------------------------------
+    // Print help :
+    //---------------------------------
+
+    if ( argc == 1 || argv[1][0] == '-' )
+    {
+        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+        return 1 ;
     }
 
     //---------------------------------
@@ -75,164 +107,77 @@ void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
-    std::vector<llama_token> token_list = prompt_tokens;
+    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
+    {
+        //---------------------------------
+        // Evaluate the tokens :
+        //---------------------------------
 
-    while (n_past < n_ctx) {
-        // evaluate the tokens
-
-        // llama_eval generates one token at a time
-        n_tokens_out = 1;
-
-        // number of threads to use for CPU evaluation - ignored if compiled with CUDA support
-        const int n_threads = 4;
-        // note: llama_eval is not compatible with GPU sampling
-        if (llama_eval(ctx, token_list.data(), token_list.size(), n_past, n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__ );
-            exit(1);
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
+        {
+            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+            return 1;
         }
 
-        // perform sampling on the CPU
-        float * logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+        tokens_list.clear();
+
+        //---------------------------------
+        // Select the best prediction :
+        //---------------------------------
+
+        llama_token new_token_id = 0;
+
+        auto logits = llama_get_logits( ctx );
+        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
 
-        // initialize candidate array from logits
         std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for(llama_token token_id = 0 ; token_id < n_vocab ; token_id++) {
-            candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f});
+        candidates.reserve( n_vocab );
+
+        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
+        {
+            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
         }
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        // sample token
-        llama_sample_temperature(ctx, &candidates_p, temperature);
-        tokens_out[0] = llama_sample_token(ctx, &candidates_p);
+        // Select it using the "Greedy sampling" method :
+        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
 
-        // increment the position in the context window
-        n_past += token_list.size() + n_tokens_out - 1;
-        token_list.clear();
-
-        // print the new tokens
-        for (int i = 0; i < n_tokens_out; i++) {
-            llama_token new_token_id = tokens_out[i];
-
-            // is it an end of stream ?
-            if (new_token_id == llama_token_eos()) {
-                fprintf(stderr, " [end of text]\n");
-                //return;
-            }
-
-            // print the new token :
-            printf("%s", llama_token_to_str(ctx, new_token_id));
+        // is it an end of stream ?
+        if ( new_token_id == llama_token_eos() )
+        {
+            fprintf(stderr, " [end of text]\n");
+            break;
         }
-        fflush(stdout);
 
-        // push the last new token for the next evaluation
-        token_list.push_back(tokens_out[n_tokens_out - 1]);
-    }
-}
+        // Print the new token :
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
+        fflush( stdout );
 
-int main(int argc, char ** argv) {
-    if (argc < 2 || argv[1][0] == '-') {
-        printf("usage: %s [prompt]\n", argv[0]);
-        printf("  note: passing a temp parameter will enable GPU sampling\n");
-        return 1 ;
-    }
+        // Push this new token for next evaluation :
+        tokens_list.push_back( new_token_id );
 
-    std::string model = argv[1];
-    struct llama_context_params lparams = llama_context_default_params();
+    } // wend of main loop
 
-    if (argc >= 3) {
-        lparams.n_ctx = std::stoi(argv[2]);
-    } else {
-        lparams.n_ctx = 512;
-    }
+    llama_free( ctx );
+    llama_free_model( model );
 
-    int n_gens;
-    if (argc >= 4) {
-        n_gens = std::stoi(argv[3]);
-    } else {
-        n_gens = 1;
-    }
-
-    float temperature;
-
-    if (argc >= 5) {
-        temperature = std::stof(argv[4]);
-    } else {
-        temperature = 0.8f;
-    }
-
-    std::string prompt;
-    if (argc >= 6) {
-        prompt = argv[5];
-    } else {
-        prompt = "Hello my name is";
-    }
-
-    // initialize llama.cpp
-    bool numa = false;
-    llama_init_backend(numa);
-
-    llama_model * lmodel = llama_load_model_from_file(model.c_str(), lparams);
-    if (lmodel == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
-        return 1;
-    }
-
-    llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, model.c_str());
-        llama_free_model(lmodel);
-        return 1;
-    }
-
-    // tokenize the prompt
-    std::vector<llama_token> token_list(lparams.n_ctx);
-    int prompt_tokens = llama_tokenize(ctx, prompt.c_str(), token_list.data(), token_list.size(), true);
-    if (prompt_tokens <= 0) {
-        fprintf(stderr, "%s: error: unable to tokenize prompt\n", __func__);
-        return 1;
-    }
-
-    token_list.resize(prompt_tokens);
-
-    const int max_context_size = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4 ;
-
-    if ((int)token_list.size() > max_tokens_list_size) {
-        fprintf( stderr, "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__, (int)token_list.size(), max_tokens_list_size );
-        return 1;
-    }
-
-    fprintf(stderr, "\n\n");
-
-    // generate the sequences
-    for (int i = 0; i < n_gens; i++) {
-        printf("==== GENERATION %d ====\n", i + 1);
-        generate_sequence(ctx, max_context_size, token_list, temperature);
-        printf("\n\n");
-    }
-
-    llama_print_timings(ctx);
-    llama_free(ctx);
+    llama_backend_free();
 
     llama_backend_free();
 
     return 0;
 }
+
+// EOF