llama.cpp/src/llama-impl.h

#pragma once

#include "llama.h"

#include <string>
#include <vector>
#include <stdexcept>

// LLAMA_ATTRIBUTE_FORMAT(fmt_idx, first_arg_idx)
//
// Annotates a printf-style function so that GNU-compatible compilers can check
// the format string against the variadic arguments. The macro arguments are
// forwarded verbatim to __attribute__((format(<archetype>, ...))).
#ifdef __GNUC__
#ifdef __MINGW32__
// MinGW builds select the gnu_printf archetype instead of plain printf
// NOTE(review): presumably because plain `printf` maps to the MS runtime conventions there - confirm against GCC docs
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
// compilers that do not define __GNUC__: the annotation expands to nothing
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif

//
// logging
//

// printf-style logging entry point used by the LLAMA_LOG* macros.
// The format attribute marks argument 2 (`format`) as the format string and
// argument 3 as the first variadic argument to be checked against it.
LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal        (ggml_log_level level, const char * format, ...);
// log callback taking an already-formatted `text` plus an opaque `user_data` pointer
// NOTE(review): presumably the callback installed when the user does not provide one - defined elsewhere, confirm
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

// Convenience wrappers: each macro forwards its printf-style arguments
// (format string first) to llama_log_internal() with a fixed log level.
//   LLAMA_LOG       -> GGML_LOG_LEVEL_NONE
//   LLAMA_LOG_INFO  -> GGML_LOG_LEVEL_INFO
//   LLAMA_LOG_WARN  -> GGML_LOG_LEVEL_WARN
//   LLAMA_LOG_ERROR -> GGML_LOG_LEVEL_ERROR
//   LLAMA_LOG_DEBUG -> GGML_LOG_LEVEL_DEBUG
//   LLAMA_LOG_CONT  -> GGML_LOG_LEVEL_CONT (continues the previous log entry)
#define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)

//
// helpers
//

// Scoped timer: when the object is destroyed, the time elapsed since its
// construction (difference of two ggml_time_us() readings) is added to an
// accumulator supplied by the caller.
struct time_meas {
    // t_acc   - accumulator that receives the elapsed time on destruction
    // disable - when true, no start time is sampled and the destructor does nothing
    time_meas(int64_t & t_acc, bool disable = false)
        : t_start_us(disable ? -1 : ggml_time_us()),
          t_acc(t_acc) {
    }

    ~time_meas() {
        // a negative start value marks a disabled measurement
        if (t_start_us < 0) {
            return;
        }
        t_acc += ggml_time_us() - t_start_us;
    }

    // ggml_time_us() reading taken at construction, or -1 when disabled
    const int64_t t_start_us;

    // reference to the caller's accumulator
    int64_t & t_acc;
};

// Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right. An empty `search` leaves `s` untouched.
//
// The result is assembled in a separate string and moved into `s` at the end,
// so each character of the input is copied once instead of being shifted on
// every replacement.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    const size_t needle_len = search.length();
    if (needle_len == 0) {
        return;
    }

    std::string result;
    result.reserve(s.length());

    // `cursor` is the start of the not-yet-copied tail of `s`
    size_t cursor = 0;
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        result.append(s, cursor, hit - cursor); // text between the previous match and this one
        result += replace;
        cursor = hit + needle_len;              // resume right after the match
    }

    // whatever follows the last match (or the whole string when nothing matched)
    result.append(s, cursor, std::string::npos);
    s = std::move(result);
}

// Returns a const reference to a list of (string, ggml_tensor *) pairs for the given context.
// NOTE(review): presumably the string is the tensor name and the list belongs to the model
// behind `ctx` - the definition is not in this header, confirm before relying on lifetime.
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
);

// the ring buffer works similarly to std::deque, but with a fixed capacity
//
// Elements are stored in a std::vector<T> of `capacity` slots that is allocated
// once in the constructor. `first` is the slot of the oldest element, `pos` is
// the slot the next push_back() will write to, and `sz` is the number of live
// elements, so that pos == (first + sz) % capacity whenever capacity > 0.
// Pushing into a full buffer overwrites the oldest element.
template<typename T>
struct ring_buffer {
    // cap - maximum number of elements; storage for `cap` value-initialized T is allocated up front
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    // oldest element; throws std::runtime_error when the buffer is empty
    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    // most recently pushed element; throws std::runtime_error when the buffer is empty
    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        // `pos` is the *next* write slot, so the newest element sits one slot
        // before it - the same slot that rat(0) addresses
        return data[(first + sz - 1) % capacity];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        // see the non-const overload: the newest element is at first + sz - 1
        return data[(first + sz - 1) % capacity];
    }

    // appends a copy of `value`; when the buffer is full the oldest element is overwritten
    // throws std::runtime_error when the buffer was created with capacity 0
    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    // removes the oldest element and returns a copy of it
    // throws std::runtime_error when the buffer is empty
    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    //T & operator[](size_t i) {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    //const T & at(size_t i) const {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    // reverse access: rat(0) is the newest element, rat(size() - 1) the oldest
    // throws std::runtime_error when i >= size()
    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    // copies the live elements into a vector, ordered from oldest to newest
    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    void clear() {
        // here only reset the status of the buffer
        // (the stored T objects are neither destroyed nor reassigned)
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0; // maximum number of elements
    size_t sz = 0;       // current number of elements
    size_t first = 0;    // slot of the oldest element
    size_t pos = 0;      // slot the next push_back() writes to
    std::vector<T> data; // backing storage, `capacity` slots
};
llama : move vocab, grammar and sampling into separate files (#8508) * llama : move sampling code into llama-sampling ggml-ci * llama : move grammar code into llama-grammar ggml-ci * cont ggml-ci * cont : pre-fetch rules * cont ggml-ci * llama : deprecate llama_sample_grammar * llama : move tokenizers into llama-vocab ggml-ci * make : update llama.cpp deps [no ci] * llama : redirect external API to internal APIs ggml-ci * llama : suffix the internal APIs with "_impl" ggml-ci * llama : clean-up 2024-07-23 12:10:17 +02:00			`#pragma once`

			`#include "llama.h"`

llama : refactor sampling v2 (#9294) - Add `struct llama_sampler` and `struct llama_sampler_i` - Add `llama_sampler_` API - Add `llama_sampler_chain_` API for chaining multiple samplers - Remove `LLAMA_API_INTERNAL` - Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings` 2024-09-07 14:16:19 +02:00			`#include <string>`
			`#include <vector>`
			`#include <stdexcept>`

llama : move vocab, grammar and sampling into separate files (#8508) * llama : move sampling code into llama-sampling ggml-ci * llama : move grammar code into llama-grammar ggml-ci * cont ggml-ci * cont : pre-fetch rules * cont ggml-ci * llama : deprecate llama_sample_grammar * llama : move tokenizers into llama-vocab ggml-ci * make : update llama.cpp deps [no ci] * llama : redirect external API to internal APIs ggml-ci * llama : suffix the internal APIs with "_impl" ggml-ci * llama : clean-up 2024-07-23 12:10:17 +02:00			`#ifdef __GNUC__`
			`#ifdef __MINGW32__`
			`#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))`
			`#else`
			`#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))`
			`#endif`
			`#else`
			`#define LLAMA_ATTRIBUTE_FORMAT(...)`
			`#endif`

			`//`
			`// logging`
			`//`

			`LLAMA_ATTRIBUTE_FORMAT(2, 3)`
			`void llama_log_internal (ggml_log_level level, const char * format, ...);`
			`void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);`

common : reimplement logging (#9418) https://github.com/ggerganov/llama.cpp/pull/9418 2024-09-15 19:46:12 +02:00			`#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)`
llama : move vocab, grammar and sampling into separate files (#8508) * llama : move sampling code into llama-sampling ggml-ci * llama : move grammar code into llama-grammar ggml-ci * cont ggml-ci * cont : pre-fetch rules * cont ggml-ci * llama : deprecate llama_sample_grammar * llama : move tokenizers into llama-vocab ggml-ci * make : update llama.cpp deps [no ci] * llama : redirect external API to internal APIs ggml-ci * llama : suffix the internal APIs with "_impl" ggml-ci * llama : clean-up 2024-07-23 12:10:17 +02:00			`#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)`
			`#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)`
			`#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)`
log : add CONT level for continuing previous log entry (#9610) 2024-09-24 09:15:35 +02:00			`#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)`
			`#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)`
llama : better replace_all (cont) (#8926) * llama : better replace_all (cont) ggml-ci * code : deduplicate replace_all ggml-ci 2024-08-09 17:23:52 +02:00
			`//`
			`// helpers`
			`//`

llama : refactor sampling v2 (#9294) - Add `struct llama_sampler` and `struct llama_sampler_i` - Add `llama_sampler_` API - Add `llama_sampler_chain_` API for chaining multiple samplers - Remove `LLAMA_API_INTERNAL` - Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings` 2024-09-07 14:16:19 +02:00			`struct time_meas {`
			`time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}`

			`~time_meas() {`
			`if (t_start_us >= 0) {`
			`t_acc += ggml_time_us() - t_start_us;`
			`}`
			`}`

			`const int64_t t_start_us;`

			`int64_t & t_acc;`
			`};`

llama : better replace_all (cont) (#8926) * llama : better replace_all (cont) ggml-ci * code : deduplicate replace_all ggml-ci 2024-08-09 17:23:52 +02:00			`static void replace_all(std::string & s, const std::string & search, const std::string & replace) {`
			`if (search.empty()) {`
llama : fix time complexity of string replacement (#9163) This change fixes a bug where replacing text in a very long string could cause llama.cpp to hang indefinitely. This is because the algorithm used was quadratic, due to memmove() when s.replace() is called in a loop. It seems most search results and LLM responses actually provide the O(n**2) algorithm, which is a great tragedy. Using a builder string fixes things 2024-08-26 08:09:53 +02:00			`return;`
llama : better replace_all (cont) (#8926) * llama : better replace_all (cont) ggml-ci * code : deduplicate replace_all ggml-ci 2024-08-09 17:23:52 +02:00			`}`
llama : fix time complexity of string replacement (#9163) This change fixes a bug where replacing text in a very long string could cause llama.cpp to hang indefinitely. This is because the algorithm used was quadratic, due to memmove() when s.replace() is called in a loop. It seems most search results and LLM responses actually provide the O(n**2) algorithm, which is a great tragedy. Using a builder string fixes things 2024-08-26 08:09:53 +02:00			`std::string builder;`
			`builder.reserve(s.length());`
llama : better replace_all (cont) (#8926) * llama : better replace_all (cont) ggml-ci * code : deduplicate replace_all ggml-ci 2024-08-09 17:23:52 +02:00			`size_t pos = 0;`
llama : fix time complexity of string replacement (#9163) This change fixes a bug where replacing text in a very long string could cause llama.cpp to hang indefinitely. This is because the algorithm used was quadratic, due to memmove() when s.replace() is called in a loop. It seems most search results and LLM responses actually provide the O(n**2) algorithm, which is a great tragedy. Using a builder string fixes things 2024-08-26 08:09:53 +02:00			`size_t last_pos = 0;`
			`while ((pos = s.find(search, last_pos)) != std::string::npos) {`
			`builder.append(s, last_pos, pos - last_pos);`
			`builder.append(replace);`
			`last_pos = pos + search.length();`
llama : better replace_all (cont) (#8926) * llama : better replace_all (cont) ggml-ci * code : deduplicate replace_all ggml-ci 2024-08-09 17:23:52 +02:00			`}`
llama : fix time complexity of string replacement (#9163) This change fixes a bug where replacing text in a very long string could cause llama.cpp to hang indefinitely. This is because the algorithm used was quadratic, due to memmove() when s.replace() is called in a loop. It seems most search results and LLM responses actually provide the O(n**2) algorithm, which is a great tragedy. Using a builder string fixes things 2024-08-26 08:09:53 +02:00			`builder.append(s, last_pos, std::string::npos);`
			`s = std::move(builder);`
llama : better replace_all (cont) (#8926) * llama : better replace_all (cont) ggml-ci * code : deduplicate replace_all ggml-ci 2024-08-09 17:23:52 +02:00			`}`
llama : refactor sampling v2 (#9294) - Add `struct llama_sampler` and `struct llama_sampler_i` - Add `llama_sampler_` API - Add `llama_sampler_chain_` API for chaining multiple samplers - Remove `LLAMA_API_INTERNAL` - Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings` 2024-09-07 14:16:19 +02:00
			`const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(`
			`struct llama_context * ctx`
			`);`

			`// the ring buffer works similarly to std::deque, but with a fixed capacity`
			`template<typename T>`
			`struct ring_buffer {`
			`ring_buffer(size_t cap) : capacity(cap), data(cap) {}`

			`T & front() {`
			`if (sz == 0) {`
			`throw std::runtime_error("ring buffer is empty");`
			`}`
			`return data[first];`
			`}`

			`const T & front() const {`
			`if (sz == 0) {`
			`throw std::runtime_error("ring buffer is empty");`
			`}`
			`return data[first];`
			`}`

			`T & back() {`
			`if (sz == 0) {`
			`throw std::runtime_error("ring buffer is empty");`
			`}`
			`return data[pos];`
			`}`

			`const T & back() const {`
			`if (sz == 0) {`
			`throw std::runtime_error("ring buffer is empty");`
			`}`
			`return data[pos];`
			`}`

			`void push_back(const T & value) {`
llama : refactor samplers internal implementation (#9370) 2024-09-08 15:52:07 +02:00			`if (capacity == 0) {`
			`throw std::runtime_error("ring buffer: capacity is zero");`
			`}`

llama : refactor sampling v2 (#9294) - Add `struct llama_sampler` and `struct llama_sampler_i` - Add `llama_sampler_` API - Add `llama_sampler_chain_` API for chaining multiple samplers - Remove `LLAMA_API_INTERNAL` - Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings` 2024-09-07 14:16:19 +02:00			`if (sz == capacity) {`
			`// advance the start when buffer is full`
			`first = (first + 1) % capacity;`
			`} else {`
			`sz++;`
			`}`
			`data[pos] = value;`
			`pos = (pos + 1) % capacity;`
			`}`

			`T pop_front() {`
			`if (sz == 0) {`
			`throw std::runtime_error("ring buffer is empty");`
			`}`
			`T value = data[first];`
			`first = (first + 1) % capacity;`
			`sz--;`
			`return value;`
			`}`

			`//T & operator[](size_t i) {`
			`// if (i >= sz) {`
			`// throw std::runtime_error("ring buffer: index out of bounds");`
			`// }`
			`// return data[(first + i) % capacity];`
			`//}`

			`//const T & at(size_t i) const {`
			`// if (i >= sz) {`
			`// throw std::runtime_error("ring buffer: index out of bounds");`
			`// }`
			`// return data[(first + i) % capacity];`
			`//}`

			`const T & rat(size_t i) const {`
			`if (i >= sz) {`
			`throw std::runtime_error("ring buffer: index out of bounds");`
			`}`
			`return data[(first + sz - i - 1) % capacity];`
			`}`

			`std::vector<T> to_vector() const {`
			`std::vector<T> result;`
			`result.reserve(sz);`
			`for (size_t i = 0; i < sz; i++) {`
			`result.push_back(data[(first + i) % capacity]);`
			`}`
			`return result;`
			`}`

			`void clear() {`
			`// here only reset the status of the buffer`
			`sz = 0;`
			`first = 0;`
			`pos = 0;`
			`}`

			`bool empty() const {`
			`return sz == 0;`
			`}`

			`size_t size() const {`
			`return sz;`
			`}`

			`size_t capacity = 0;`
			`size_t sz = 0;`
			`size_t first = 0;`
			`size_t pos = 0;`
			`std::vector<T> data;`
			`};`