llama.cpp/examples/beam-search/beam-search.cpp

#include "common.h"
#include "llama.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#   define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

// Used for debugging to print out beam tokens.
// Pairs a beam view with the context that operator<< hands to llama_token_to_piece().
// Instances are brace-initialised positionally in beam_search_callback(), so member order matters.
struct ostream_beam_view {
    llama_context * ctx;        // context passed to llama_token_to_piece() when printing tokens
    llama_beam_view beam_view;  // beam to print: its p, eob flag and tokens[0..n_tokens)
};

// Debug printer for a single beam: writes "p(<p>) eob(<true|false>) tokens(<pieces>)" to os.
//   os  - destination stream; returned to allow chaining.
//   obv - beam view plus the context used to turn each token into its text piece.
// The stream's format flags are restored after printing eob, so the std::boolalpha used here
// does not leak out and change how the caller's later bool output is rendered.
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
    const std::ios_base::fmtflags saved_flags = os.flags();
    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
    os.flags(saved_flags);
    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
    }
    return os << ')';
}

// Put here anything you want back in beam_search_callback().
// main() passes the address of one of these to llama_beam_search() as the opaque callback pointer.
struct beam_search_callback_data {
    llama_context * ctx;                // used by is_at_eob() for the EOS lookup and by the debug beam printout
    std::vector<llama_token> response;  // beam_search_callback() appends the common-prefix tokens here
};

// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}

// Function matching type llama_beam_search_callback_fn_t.
// Example callback, called each time the beam lengths increase and also when the stop condition is met:
//  * Flags every beam whose last token is EOS as end-of-beam (eob).
//  * Shows progress by printing ',' followed by the number of convergent beam tokens, if any.
//  * When the beams share a common prefix (common_prefix_length > 0), those tokens are read from
//    beams_state.beam_views[0] and appended to the response vector inside callback_data.
// callback_data_ptr must point to a beam_search_callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
    auto & callback_data = *static_cast<beam_search_callback_data *>(callback_data_ptr);
    // Mark beams as EOB as needed.
    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
        llama_beam_view & beam_view = beams_state.beam_views[i];
        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
            beam_view.eob = true;
        }
    }
    printf(",");  // Show progress
    if (const size_t n = beams_state.common_prefix_length) {
        assert(0u < beams_state.n_beams);
        const llama_token * tokens = beams_state.beam_views[0].tokens;
        // Append the shared prefix in one step (no resize-then-copy, no <algorithm> dependency).
        callback_data.response.insert(callback_data.response.end(), tokens, tokens + n);
        printf("%zu", n);
    }
    fflush(stdout);  // push the printf progress out before the iostream debug output below
#if 1 // DEBUG: print current beams for this iteration
    // Request boolalpha explicitly so last_call is rendered the same way on every invocation.
    std::cout << "\n\nCurrent beams (last_call=" << std::boolalpha << beams_state.last_call << "):\n";
    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
    }
#endif
}

// Entry point. Usage: MODEL_PATH [BEAM_WIDTH=2] [PROMPT]
// Loads the model, evaluates the prompt, runs llama_beam_search() with beam_search_callback()
// and prints the collected response tokens.
// Returns 0 on success, 1 on a usage error or when loading / tokenizing / decoding fails.
int main(int argc, char ** argv)
{
    gpt_params params;
    //params.n_gpu_layers = 200;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc < 2 || argv[1][0] == '-' )
    {
        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    params.model = argv[1];

    params.n_beams = 2;
    if ( argc > 2 )
    {
        // std::stoi() throws on non-numeric or out-of-range text, and a zero/negative width would
        // turn into a huge size_t below, so validate the argument instead of trusting it.
        const std::string beam_arg = argv[2];
        bool beam_arg_ok = false;
        try
        {
            size_t n_parsed = 0;
            const int width = std::stoi(beam_arg, &n_parsed);
            if (n_parsed == beam_arg.size() && width >= 1)
            {
                params.n_beams = width;
                beam_arg_ok = true;
            }
        }
        catch (const std::exception &)
        {
            beam_arg_ok = false;
        }
        if (!beam_arg_ok)
        {
            fprintf( stderr , "%s: error: BEAM_WIDTH must be a positive integer, got '%s'\n" , __func__ , argv[2] );
            return 1;
        }
    }

    if ( argc > 3 )
    {
        params.prompt = argv[3];
    }

    if ( params.prompt.empty() )
    {
        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------

    llama_backend_init(params.numa);

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;

    std::tie(model, ctx) = llama_init_from_gpt_params( params );

    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
        llama_backend_free();
        return 1;
    }

    // Shared teardown so that the error paths below release everything the success path does.
    const auto release_llama = [&]() {
        llama_free( ctx );
        llama_free_model( model );
        llama_backend_free();
    };

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------

    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

    const size_t max_context_size     = llama_n_ctx( ctx );
    // Guard the subtraction: an unsigned "ctx - 4" would wrap around for a context smaller than 4.
    const size_t max_tokens_list_size = max_context_size > 4 ? max_context_size - 4 : 0;

    if (tokens_list.size() > max_tokens_list_size)
    {
        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
             __func__ , tokens_list.size() , max_tokens_list_size );
        release_llama();
        return 1;
    }

    fprintf( stderr, "\n\n" );

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        std::cout << llama_token_to_piece(ctx, id);
    }
    std::cout << std::flush;

    int n_past = 0;

    // The size check above bounds the token count by the context size, so it fits in an int.
    const int n_prompt_tokens = static_cast<int>(tokens_list.size());

    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), n_prompt_tokens, n_past, 0)))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
        release_llama();
        return 1;
    }
    n_past += n_prompt_tokens;

    beam_search_callback_data callback_data{ctx, {}};
    size_t const beam_width = static_cast<size_t>(params.n_beams);
    int const n_predict = 256;
    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);

    std::cout << "\n\n";
    for (llama_token const token_id : callback_data.response) {
        std::cout << llama_token_to_piece(ctx,token_id);
    }
    std::cout << std::endl;

    release_llama();

    return 0;
}
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`#include "common.h"`
			`#include "llama.h"`

			`#include <cassert>`
			`#include <cinttypes>`
			`#include <cmath>`
			`#include <cstdio>`
			`#include <cstring>`
			`#include <ctime>`
			`#include <fstream>`
			`#include <iostream>`
			`#include <string>`
			`#include <vector>`

			`#if defined (__unix__) \|\| (defined (__APPLE__) && defined (__MACH__))`
			`#include <signal.h>`
			`#include <unistd.h>`
			`#elif defined (_WIN32)`
			`#define WIN32_LEAN_AND_MEAN`
build : fix most gcc and clang warnings (#2861) * fix most gcc and clang warnings * baby-llama : remove commented opt_params_adam * fix some MinGW warnings * fix more MinGW warnings 2023-09-01 15:34:50 +02:00			`#ifndef NOMINMAX`
			`# define NOMINMAX`
			`#endif`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`#include <windows.h>`
			`#include <signal.h>`
			`#endif`

			`// Used for debugging to print out beam tokens.`
			`struct ostream_beam_view {`
			`llama_context * ctx;`
			`llama_beam_view beam_view;`
			`};`
check C++ code with -Wmissing-declarations (#3184) 2023-09-15 21:38:27 +02:00
			`static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";`
			`for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {`
llama : more tokenizer fixes (#2810) * tests : write a Python tokenizer test (wip) * llama : prefix input text for tokenization with whitespace * llama : distinguish pieces from decoded text + fix detokenization * common : add comments * examples : no longer manually add leading space when tokenizing * tests : use Python to generate tokenizer tests for C++ * tests : add option to tokenize text files ggml-ci * tests : add test-tokenizer-1.py * llama.cpp : fix LF token * hellaswag : move the concat space for clarity * tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci * common : temporary separate llama_detokenize calls for SPM and BPE --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> 2023-08-27 13:19:19 +02:00			`os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`}`
			`return os << ')';`
			`}`

			`// Put here anything you want back in beam_search_callback().`
			`struct beam_search_callback_data {`
			`llama_context * ctx;`
			`std::vector<llama_token> response;`
			`};`

			`// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.`
			`// For example, eob can be flagged due to maximum token length, stop words, etc.`
check C++ code with -Wmissing-declarations (#3184) 2023-09-15 21:38:27 +02:00			`static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);`
			`}`

			`// Function matching type llama_beam_search_callback_fn_t.`
			`// Custom callback example is called each time the beams lengths increase:`
			`// * Show progress by printing ',' following by number of convergent beam tokens if any.`
			`// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].`
			`// This is also called when the stop condition is met.`
			`// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.`
check C++ code with -Wmissing-declarations (#3184) 2023-09-15 21:38:27 +02:00			`static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);`
			`// Mark beams as EOS as needed.`
			`for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {`
			`llama_beam_view& beam_view = beams_state.beam_views[i];`
			`if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {`
			`beam_view.eob = true;`
			`}`
			`}`
			`printf(","); // Show progress`
			`if (const size_t n = beams_state.common_prefix_length) {`
			`callback_data.response.resize(callback_data.response.size() + n);`
			`assert(0u < beams_state.n_beams);`
			`const llama_token * tokens = beams_state.beam_views[0].tokens;`
			`std::copy(tokens, tokens + n, callback_data.response.end() - n);`
build : fix most gcc and clang warnings (#2861) * fix most gcc and clang warnings * baby-llama : remove commented opt_params_adam * fix some MinGW warnings * fix more MinGW warnings 2023-09-01 15:34:50 +02:00			`printf("%zu", n);`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`}`
			`fflush(stdout);`
			`#if 1 // DEBUG: print current beams for this iteration`
			`std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";`
			`for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {`
			`std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;`
			`}`
			`#endif`
			`}`

			`int main(int argc, char ** argv)`
			`{`
			`gpt_params params;`
			`//params.n_gpu_layers = 200;`

			`//---------------------------------`
			`// Print help :`
			`//---------------------------------`

			`if ( argc < 2 \|\| argv[1][0] == '-' )`
			`{`
			`printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );`
			`return 1 ;`
			`}`

			`//---------------------------------`
			`// Load parameters :`
			`//---------------------------------`

			`params.model = argv[1];`

			`params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;`

			`if ( argc > 3 )`
			`{`
			`params.prompt = argv[3];`
			`}`

			`if ( params.prompt.empty() )`
			`{`
			`params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";`
			`}`

			`//---------------------------------`
			`// Init LLM :`
			`//---------------------------------`

			`llama_backend_init(params.numa);`

			`llama_model * model;`
			`llama_context * ctx;`

			`std::tie(model, ctx) = llama_init_from_gpt_params( params );`

			`if ( model == NULL )`
			`{`
			`fprintf( stderr , "%s: error: unable to load model\n" , __func__ );`
			`return 1;`
			`}`

			`//---------------------------------`
			`// Tokenize the prompt :`
			`//---------------------------------`

			`std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);`

			`const size_t max_context_size = llama_n_ctx( ctx );`
			`const size_t max_tokens_list_size = max_context_size - 4 ;`

			`if (tokens_list.size() > max_tokens_list_size)`
			`{`
build : fix most gcc and clang warnings (#2861) * fix most gcc and clang warnings * baby-llama : remove commented opt_params_adam * fix some MinGW warnings * fix more MinGW warnings 2023-09-01 15:34:50 +02:00			`fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`__func__ , tokens_list.size() , max_tokens_list_size );`
			`return 1;`
			`}`

			`fprintf( stderr, "\n\n" );`

			`// Print the tokens from the prompt :`

			`for( auto id : tokens_list )`
			`{`
llama : more tokenizer fixes (#2810) * tests : write a Python tokenizer test (wip) * llama : prefix input text for tokenization with whitespace * llama : distinguish pieces from decoded text + fix detokenization * common : add comments * examples : no longer manually add leading space when tokenizing * tests : use Python to generate tokenizer tests for C++ * tests : add option to tokenize text files ggml-ci * tests : add test-tokenizer-1.py * llama.cpp : fix LF token * hellaswag : move the concat space for clarity * tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci * common : temporary separate llama_detokenize calls for SPM and BPE --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> 2023-08-27 13:19:19 +02:00			`std::cout << llama_token_to_piece(ctx, id);`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`}`
			`std::cout << std::flush;`

llama : custom attention mask + parallel decoding + no context swaps (#3228) * tests : verify that RoPE is "additive" * llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask) * ggml : ggml_rope now takes a vector with positions instead of n_past * metal : add rope_f16 kernel + optimize cpy kernels * llama : unified KV cache + batch inference API * llama : add new llama_decode() API that works with llama_batch * llama : add cell_max heuristic for more efficient kv_cache * llama : extend llama_kv_cache API * llama : more robust cell_max heuristic + wip shift * metal : disable concurrency optimization * llama : add llama_kv_cache_shift_seq + no more context swaps * llama : apply K-cache roping for Falcon and Baichuan * speculative : fix KV cache management * parallel : example for serving multiple users in parallel * parallel : disable hot-plug to avoid cache fragmentation * fixes : speculative KV cache + llama worst-case graph * llama : extend batch API to select which logits to output * llama : fix worst case graph build * ggml-cuda : update rope implementation for parallel decoding (#3254) * ggml-cuda : update rope implementation for parallel decoding * better solution for p0 computation * fix rope * simpler rope implementation --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * make : add parallel to build + fix static functions in llama.cpp * simple : fix token counting * parallel : various improvements * llama : fix cell_max logic + rename functions * parallel : try smaller batches when the KV cache is fragmented * parallel : fix sequence termination criteria * llama : silence errors KV cache errors * parallel : remove new line from prompt * parallel : process system prompt once + configurable paramters + llama API * parallel : remove question with short answers * parallel : count cache misses * parallel : print misses on each request * parallel : minor * llama : fix n_kv to never become 0 * parallel : rename hot-plug to 
continuous-batching * llama : improve llama_batch API + simplify parallel example * simple : add parallel decoding support * simple : improve comments + free batch * ggml-cuda : add rope f16, restore performance with parallel decoding (#3272) * ggml-cuda : add rope f16, restore performance * offload KQ_mask with all models * fix rope shift --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * llama : disable MPI for now ggml-ci * train : make KQ_pos memory buffer permanent via dummy scale op * ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275) ggml-ci * parallel : fix bug (extra BOS) + smaller token_prev array * parallel : fix cases where the input prompts can overflow the batch * parallel : add disabled experimental batch chunking in powers of two * llama : llama.h formatting + comments * simple : add README.md * llama : fix kv cache heuristic when context is less than 32 * parallel : fix crash when `-n -1` * llama : simplify returns if/else branches * metal : use mm kernels for batch size > 2 * examples : utilize new llama_get_logits_ith() * examples : add example for batched decoding * examples : do not eval prompt 2 times (close #3348) * server : clear the KV cache beyond n_past before llama_decode * server : avoid context swaps by shifting the KV cache --------- Co-authored-by: slaren <slarengh@gmail.com> 2023-09-28 18:04:36 +02:00			`int n_past = 0;`

llama.cpp : split llama_context_params into model and context params (#3301) * llama.cpp : split llama_context_params into model and context params ggml-ci * fix metal build * fix freq_base/scale default to model value * llama-bench : keep the same model between tests when possible * move n_threads to llama_context_params, add n_threads_batch * fix mpi build * remove kv_size(), cuda scratch fixes * remove low-vram option * add n_threads_batch to system info, refactor to get_system_info() * add documentation about --threads-batch to the READMEs * llama-bench fix * main : fix rope freq/scale warning * llama.cpp : add llama_get_model common : add llama_tokenize from model * remove duplicated ctx/model functions ggml-ci * cuda : print total VRAM used 2023-09-28 21:42:38 +02:00			`if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`{`
			`fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );`
			`return 1;`
			`}`
			`n_past += tokens_list.size();`

			`beam_search_callback_data callback_data{ctx, {}};`
			`size_t const beam_width = static_cast<size_t>(params.n_beams);`
			`int const n_predict = 256;`
llama.cpp : split llama_context_params into model and context params (#3301) * llama.cpp : split llama_context_params into model and context params ggml-ci * fix metal build * fix freq_base/scale default to model value * llama-bench : keep the same model between tests when possible * move n_threads to llama_context_params, add n_threads_batch * fix mpi build * remove kv_size(), cuda scratch fixes * remove low-vram option * add n_threads_batch to system info, refactor to get_system_info() * add documentation about --threads-batch to the READMEs * llama-bench fix * main : fix rope freq/scale warning * llama.cpp : add llama_get_model common : add llama_tokenize from model * remove duplicated ctx/model functions ggml-ci * cuda : print total VRAM used 2023-09-28 21:42:38 +02:00			`llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00
			`std::cout << "\n\n";`
			`for (llama_token const token_id : callback_data.response) {`
llama : more tokenizer fixes (#2810) * tests : write a Python tokenizer test (wip) * llama : prefix input text for tokenization with whitespace * llama : distinguish pieces from decoded text + fix detokenization * common : add comments * examples : no longer manually add leading space when tokenizing * tests : use Python to generate tokenizer tests for C++ * tests : add option to tokenize text files ggml-ci * tests : add test-tokenizer-1.py * llama.cpp : fix LF token * hellaswag : move the concat space for clarity * tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci * common : temporary separate llama_detokenize calls for SPM and BPE --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> 2023-08-27 13:19:19 +02:00			`std::cout << llama_token_to_piece(ctx,token_id);`
llama : add llama_beam_search() (#2267) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs. 2023-08-25 17:18:48 +02:00			`}`
			`std::cout << std::endl;`

			`llama_free( ctx );`
			`llama_free_model( model );`

			`llama_backend_free();`

			`return 0;`
			`}`