restore simple.cpp for now

This commit is contained in:
slaren 2023-07-15 12:44:47 +02:00
parent 0d2b66c638
commit 5765d7a587

View File

@ -1,14 +1,46 @@
#include <stdio.h> #ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "llama.h" #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
// print the tokens from the prompt int main(int argc, char ** argv)
for (llama_token id : prompt_tokens) { {
printf("%s", llama_token_to_str(ctx, id)); gpt_params params;
//---------------------------------
// Print help :
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
return 1 ;
} }
//--------------------------------- //---------------------------------
@ -75,164 +107,77 @@ void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_t
fflush(stdout); fflush(stdout);
// the maximum number of tokens to generate at a time
// TODO: not supported, remove
const int CUDA_MAX_TOKENS = 1;
llama_token tokens_out[CUDA_MAX_TOKENS];
// current position in the context window //---------------------------------
int n_past = 0; // Main prediction loop :
//---------------------------------
// number of tokens to generate // The LLM keeps a contextual cache memory of previous token evaluation.
int n_tokens_out; // Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
// list of tokens to evaluate while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
// note that at most llama_context_params::n_batch tokens can be evaluated at a time {
std::vector<llama_token> token_list = prompt_tokens; //---------------------------------
// Evaluate the tokens :
//---------------------------------
while (n_past < n_ctx) { if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
// evaluate the tokens {
fprintf( stderr, "%s : failed to eval\n" , __func__ );
// llama_eval generates one token at a time return 1;
n_tokens_out = 1;
// number of threads to use for CPU evaluation - ignored if compiled with CUDA support
const int n_threads = 4;
// note: llama_eval is not compatible with GPU sampling
if (llama_eval(ctx, token_list.data(), token_list.size(), n_past, n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__ );
exit(1);
} }
// perform sampling on the CPU tokens_list.clear();
float * logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx); //---------------------------------
// Select the best prediction :
//---------------------------------
llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx );
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
// initialize candidate array from logits
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve( n_vocab );
for(llama_token token_id = 0 ; token_id < n_vocab ; token_id++) {
candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f}); for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
{
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample token // Select it using the "Greedy sampling" method :
llama_sample_temperature(ctx, &candidates_p, temperature); new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
tokens_out[0] = llama_sample_token(ctx, &candidates_p);
// increment the position in the context window
n_past += token_list.size() + n_tokens_out - 1;
token_list.clear();
// print the new tokens
for (int i = 0; i < n_tokens_out; i++) {
llama_token new_token_id = tokens_out[i];
// is it an end of stream ? // is it an end of stream ?
if (new_token_id == llama_token_eos()) { if ( new_token_id == llama_token_eos() )
{
fprintf(stderr, " [end of text]\n"); fprintf(stderr, " [end of text]\n");
//return; break;
} }
// print the new token : // Print the new token :
printf("%s", llama_token_to_str(ctx, new_token_id)); printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
} fflush( stdout );
fflush(stdout);
// push the last new token for the next evaluation // Push this new token for next evaluation :
token_list.push_back(tokens_out[n_tokens_out - 1]); tokens_list.push_back( new_token_id );
}
}
int main(int argc, char ** argv) { } // wend of main loop
if (argc < 2 || argv[1][0] == '-') {
printf("usage: %s <model> <n_ctx> <n_gens> <temp> [prompt]\n", argv[0]);
printf(" note: passing a temp parameter will enable GPU sampling\n");
return 1 ;
}
std::string model = argv[1]; llama_free( ctx );
struct llama_context_params lparams = llama_context_default_params(); llama_free_model( model );
if (argc >= 3) { llama_backend_free();
lparams.n_ctx = std::stoi(argv[2]);
} else {
lparams.n_ctx = 512;
}
int n_gens;
if (argc >= 4) {
n_gens = std::stoi(argv[3]);
} else {
n_gens = 1;
}
float temperature;
if (argc >= 5) {
temperature = std::stof(argv[4]);
} else {
temperature = 0.8f;
}
std::string prompt;
if (argc >= 6) {
prompt = argv[5];
} else {
prompt = "Hello my name is";
}
// initialize llama.cpp
bool numa = false;
llama_init_backend(numa);
llama_model * lmodel = llama_load_model_from_file(model.c_str(), lparams);
if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
return 1;
}
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, model.c_str());
llama_free_model(lmodel);
return 1;
}
// tokenize the prompt
std::vector<llama_token> token_list(lparams.n_ctx);
int prompt_tokens = llama_tokenize(ctx, prompt.c_str(), token_list.data(), token_list.size(), true);
if (prompt_tokens <= 0) {
fprintf(stderr, "%s: error: unable to tokenize prompt\n", __func__);
return 1;
}
token_list.resize(prompt_tokens);
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4 ;
if ((int)token_list.size() > max_tokens_list_size) {
fprintf( stderr, "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__, (int)token_list.size(), max_tokens_list_size );
return 1;
}
fprintf(stderr, "\n\n");
// generate the sequences
for (int i = 0; i < n_gens; i++) {
printf("==== GENERATION %d ====\n", i + 1);
generate_sequence(ctx, max_context_size, token_list, temperature);
printf("\n\n");
}
llama_print_timings(ctx);
llama_free(ctx);
llama_backend_free(); llama_backend_free();
return 0; return 0;
} }
// EOF