mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-03 17:51:09 +01:00
restore simple.cpp for now
This commit is contained in:
parent
0d2b66c638
commit
5765d7a587
@ -1,14 +1,46 @@
|
||||
#include <stdio.h>
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "llama.h"
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
|
||||
void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
|
||||
// print the tokens from the prompt
|
||||
for (llama_token id : prompt_tokens) {
|
||||
printf("%s", llama_token_to_str(ctx, id));
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
gpt_params params;
|
||||
|
||||
//---------------------------------
|
||||
// Print help :
|
||||
//---------------------------------
|
||||
|
||||
if ( argc == 1 || argv[1][0] == '-' )
|
||||
{
|
||||
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
@ -75,164 +107,77 @@ void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_t
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
// the maximum number of tokens to generate at a time
|
||||
// TODO: not supported, remove
|
||||
const int CUDA_MAX_TOKENS = 1;
|
||||
llama_token tokens_out[CUDA_MAX_TOKENS];
|
||||
|
||||
// current position in the context window
|
||||
int n_past = 0;
|
||||
//---------------------------------
|
||||
// Main prediction loop :
|
||||
//---------------------------------
|
||||
|
||||
// number of tokens to generate
|
||||
int n_tokens_out;
|
||||
// The LLM keeps a contextual cache memory of previous token evaluation.
|
||||
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
|
||||
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
|
||||
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
|
||||
|
||||
// list of tokens to evaluate
|
||||
// note that at most llama_context_params::n_batch tokens can be evaluated at a time
|
||||
std::vector<llama_token> token_list = prompt_tokens;
|
||||
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
|
||||
{
|
||||
//---------------------------------
|
||||
// Evaluate the tokens :
|
||||
//---------------------------------
|
||||
|
||||
while (n_past < n_ctx) {
|
||||
// evaluate the tokens
|
||||
|
||||
// llama_eval generates one token at a time
|
||||
n_tokens_out = 1;
|
||||
|
||||
// number of threads to use for CPU evaluation - ignored if compiled with CUDA support
|
||||
const int n_threads = 4;
|
||||
// note: llama_eval is not compatible with GPU sampling
|
||||
if (llama_eval(ctx, token_list.data(), token_list.size(), n_past, n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__ );
|
||||
exit(1);
|
||||
if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
|
||||
{
|
||||
fprintf( stderr, "%s : failed to eval\n" , __func__ );
|
||||
return 1;
|
||||
}
|
||||
|
||||
// perform sampling on the CPU
|
||||
float * logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
tokens_list.clear();
|
||||
|
||||
//---------------------------------
|
||||
// Select the best prediction :
|
||||
//---------------------------------
|
||||
|
||||
llama_token new_token_id = 0;
|
||||
|
||||
auto logits = llama_get_logits( ctx );
|
||||
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
|
||||
|
||||
// initialize candidate array from logits
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for(llama_token token_id = 0 ; token_id < n_vocab ; token_id++) {
|
||||
candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f});
|
||||
candidates.reserve( n_vocab );
|
||||
|
||||
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
|
||||
{
|
||||
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample token
|
||||
llama_sample_temperature(ctx, &candidates_p, temperature);
|
||||
tokens_out[0] = llama_sample_token(ctx, &candidates_p);
|
||||
// Select it using the "Greedy sampling" method :
|
||||
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
|
||||
|
||||
// increment the position in the context window
|
||||
n_past += token_list.size() + n_tokens_out - 1;
|
||||
|
||||
token_list.clear();
|
||||
|
||||
// print the new tokens
|
||||
for (int i = 0; i < n_tokens_out; i++) {
|
||||
llama_token new_token_id = tokens_out[i];
|
||||
|
||||
// is it an end of stream ?
|
||||
if (new_token_id == llama_token_eos()) {
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
//return;
|
||||
}
|
||||
|
||||
// print the new token :
|
||||
printf("%s", llama_token_to_str(ctx, new_token_id));
|
||||
// is it an end of stream ?
|
||||
if ( new_token_id == llama_token_eos() )
|
||||
{
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
// push the last new token for the next evaluation
|
||||
token_list.push_back(tokens_out[n_tokens_out - 1]);
|
||||
}
|
||||
}
|
||||
// Print the new token :
|
||||
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
|
||||
fflush( stdout );
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 2 || argv[1][0] == '-') {
|
||||
printf("usage: %s <model> <n_ctx> <n_gens> <temp> [prompt]\n", argv[0]);
|
||||
printf(" note: passing a temp parameter will enable GPU sampling\n");
|
||||
return 1 ;
|
||||
}
|
||||
// Push this new token for next evaluation :
|
||||
tokens_list.push_back( new_token_id );
|
||||
|
||||
std::string model = argv[1];
|
||||
struct llama_context_params lparams = llama_context_default_params();
|
||||
} // wend of main loop
|
||||
|
||||
if (argc >= 3) {
|
||||
lparams.n_ctx = std::stoi(argv[2]);
|
||||
} else {
|
||||
lparams.n_ctx = 512;
|
||||
}
|
||||
llama_free( ctx );
|
||||
llama_free_model( model );
|
||||
|
||||
int n_gens;
|
||||
if (argc >= 4) {
|
||||
n_gens = std::stoi(argv[3]);
|
||||
} else {
|
||||
n_gens = 1;
|
||||
}
|
||||
|
||||
float temperature;
|
||||
|
||||
if (argc >= 5) {
|
||||
temperature = std::stof(argv[4]);
|
||||
} else {
|
||||
temperature = 0.8f;
|
||||
}
|
||||
|
||||
std::string prompt;
|
||||
if (argc >= 6) {
|
||||
prompt = argv[5];
|
||||
} else {
|
||||
prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
// initialize llama.cpp
|
||||
bool numa = false;
|
||||
llama_init_backend(numa);
|
||||
|
||||
llama_model * lmodel = llama_load_model_from_file(model.c_str(), lparams);
|
||||
if (lmodel == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, model.c_str());
|
||||
llama_free_model(lmodel);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> token_list(lparams.n_ctx);
|
||||
int prompt_tokens = llama_tokenize(ctx, prompt.c_str(), token_list.data(), token_list.size(), true);
|
||||
if (prompt_tokens <= 0) {
|
||||
fprintf(stderr, "%s: error: unable to tokenize prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
token_list.resize(prompt_tokens);
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx);
|
||||
const int max_tokens_list_size = max_context_size - 4 ;
|
||||
|
||||
if ((int)token_list.size() > max_tokens_list_size) {
|
||||
fprintf( stderr, "%s: error: prompt too long (%d tokens, max %d)\n" ,
|
||||
__func__, (int)token_list.size(), max_tokens_list_size );
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
// generate the sequences
|
||||
for (int i = 0; i < n_gens; i++) {
|
||||
printf("==== GENERATION %d ====\n", i + 1);
|
||||
generate_sequence(ctx, max_context_size, token_list, temperature);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_backend_free();
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// EOF
|
||||
|
Loading…
Reference in New Issue
Block a user