llama : impl

ggml-ci
Georgi Gerganov 2024-12-23 17:32:31 +02:00
parent b0d6b66b7d
commit a7df0714db
16 changed files with 230 additions and 209 deletions

View File

@@ -638,6 +638,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
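Both spellings give the constants internal linkage; the anonymous namespace is the C++ idiom and avoids repeating `static` on every declaration. A minimal illustration of the equivalence (the names here are generic, not from this commit):

```cpp
// equivalent internal-linkage declarations: neither symbol is visible to other TUs
static const char * const KV_OLD_STYLE = "old";   // per-declaration 'static'

namespace {
const char * const KV_NEW_STYLE = "new";          // one block scopes many declarations
}
```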

View File

@@ -2,15 +2,14 @@
 #include "common.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <stdio.h>
-#include <string.h>
 #include <climits>
+#include <cstdio>
+#include <cstring>
 #include <stdexcept>

 #if defined(_WIN32)

View File

@@ -1,7 +1,7 @@
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "llama-impl.h"
+#include "llama-context.h"
+#include "common.h"

 #include <algorithm>
 #include <cassert>
@@ -9,11 +9,9 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <numeric>
 #include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -330,13 +328,13 @@ int main(int argc, char ** argv) {
         }
     }

-    const auto &tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(ctx);

     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors) {
+    for (const auto & kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -371,8 +369,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        const auto * qfns = ggml_get_type_traits(type);
-        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
@@ -382,7 +380,7 @@ int main(int argc, char ** argv) {
             error_stats global_stats {};

-            for (const auto& kv_tensor : tensors) {
+            for (const auto & kv_tensor : tensors) {
                 if (!layer_included(params, kv_tensor.first)) {
                     continue;
                 }

View File

@@ -15,6 +15,7 @@ add_library(llama
             llama-chat.cpp
             llama-context.cpp
             llama-hparams.cpp
+            llama-impl.cpp
             llama-grammar.cpp
             llama-kv-cache.cpp
             llama-mmap.cpp

View File

@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <map>
 #include <cassert>
+#include <stdexcept>

 // vec

View File

@@ -26,7 +26,9 @@ struct llama_ubatch {
 struct llama_sbatch_seq {
     int32_t n_seq_id;
+
     llama_seq_id * seq_id;
+
     size_t offset;
     size_t length;
 };
@@ -112,8 +114,8 @@ struct llama_sbatch {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 memcpy(
-                    ubatch.embd + n_embd * (ubatch.n_tokens + i),
-                    batch->embd + n_embd * ids[seq.offset + i],
+                    ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
+                    batch->embd + (n_embd * ids[seq.offset + i]),
                     n_embd * sizeof(float)
                 );
             }
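The copy is byte-identical before and after; the added parentheses only group the element offset before it is added to the base pointer. A hedged restatement of the addressing for intuition (row-major embeddings, one `n_embd`-float row per token):

```cpp
// token rows are contiguous: token t starts at float offset n_embd * t
float       * dst = ubatch.embd + (n_embd * (ubatch.n_tokens + i));  // next free row in the ubatch
const float * src = batch->embd + (n_embd * ids[seq.offset + i]);    // source row, after permutation by ids
memcpy(dst, src, n_embd * sizeof(float));                            // copy one embedding row
```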

View File

@@ -1,5 +1,7 @@
 #include "llama-context.h"

+#include <stdexcept>
+
 // deprecated
 size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);
@@ -968,3 +970,8 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
         }
     }
 }
+
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
+    return ctx->model.tensors_by_name;
+}
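The map pairs each tensor's GGUF name with its `ggml_tensor`. A sketch of how an internal test might walk it (the printing is illustrative, not part of this commit):

```cpp
// illustrative: dump the name -> tensor map exposed for internal tests
const auto & tensors = llama_internal_get_tensor_map(ctx);
for (const auto & kv : tensors) {
    const struct ggml_tensor * t = kv.second;
    printf("%-48s %10lld elements, type %s\n",
            kv.first.c_str(), (long long) ggml_nelements(t), ggml_type_name(t->type));
}
```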

View File

@@ -219,3 +219,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
         out_ids.clear();
     }
 }
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);

View File

@@ -1,5 +1,6 @@
 #include "llama-grammar.h"

+#include "llama-impl.h"
 #include "llama-vocab.h"
 #include "llama-sampling.h"

View File

@@ -1,8 +1,10 @@
 #pragma once

-#include "llama-impl.h"
+#include "llama.h"

 #include <map>
+#include <string>
+#include <vector>

 struct llama_vocab;

src/llama-impl.cpp (new file, 74 lines)
View File

@@ -0,0 +1,74 @@
+#include "llama-impl.h"
+
+#include "llama.h"
+
+#include <cstdarg>
+
+struct llama_logger_state {
+    ggml_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+
+static llama_logger_state g_logger_state;
+
+time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+time_meas::~time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
+
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = new char[len + 1];
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+void llama_log_internal(ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
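Note that llama_log_internal_v formats into a 128-byte stack buffer first and only heap-allocates when vsnprintf reports that the message did not fit, so the common short-message path does no allocation. A minimal sketch of installing a custom sink through the public llama_log_set shown above (the callback and string sink are illustrative, not part of this commit):

```cpp
#include "llama.h"

#include <string>

// illustrative callback: append every log message to a caller-owned string
static void my_log_cb(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    static_cast<std::string *>(user_data)->append(text);
}

int main() {
    std::string log_sink;
    llama_log_set(my_log_cb, &log_sink); // passing nullptr restores the default stderr sink
}
```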

View File

@@ -1,10 +1,8 @@
 #pragma once

-#include "llama.h"
+#include "ggml.h"

 #include <string>
-#include <vector>
-#include <stdexcept>

 #ifdef __GNUC__
 #ifdef __MINGW32__
@@ -40,146 +38,12 @@ std::string format(const char * fmt, ...);
 //

 struct time_meas {
-    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
-
-    ~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
-    }
+    time_meas(int64_t & t_acc, bool disable = false);
+    ~time_meas();

     const int64_t t_start_us;
     int64_t & t_acc;
 };

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    if (search.empty()) {
-        return;
-    }
-    std::string builder;
-    builder.reserve(s.length());
-    size_t pos = 0;
-    size_t last_pos = 0;
-    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        builder.append(s, last_pos, pos - last_pos);
-        builder.append(replace);
-        last_pos = pos + search.length();
-    }
-    builder.append(s, last_pos, std::string::npos);
-    s = std::move(builder);
-}
-
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
-
-// the ring buffer works similarly to std::deque, but with a fixed capacity
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
-
-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (capacity == 0) {
-            throw std::runtime_error("ring buffer: capacity is zero");
-        }
-
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    //T & operator[](size_t i) {
-    //    if (i >= sz) {
-    //        throw std::runtime_error("ring buffer: index out of bounds");
-    //    }
-    //    return data[(first + i) % capacity];
-    //}
-
-    //const T & at(size_t i) const {
-    //    if (i >= sz) {
-    //        throw std::runtime_error("ring buffer: index out of bounds");
-    //    }
-    //    return data[(first + i) % capacity];
-    //}
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-
-    std::vector<T> data;
-};
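time_meas, now declared here and defined in llama-impl.cpp, is a small RAII timer: construction records ggml_time_us(), destruction adds the elapsed microseconds to the referenced accumulator, and disable = true turns it into a no-op. A minimal usage sketch (the counter and function names are illustrative):

```cpp
int64_t t_decode_us = 0; // accumulates elapsed time across calls

void decode_step(bool timing_disabled) {
    time_meas tm(t_decode_us, timing_disabled); // starts the clock (or not)
    // ... timed work ...
} // ~time_meas() adds the elapsed microseconds to t_decode_us
```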

View File

@@ -6,6 +6,7 @@
 #include <cstring>
 #include <climits>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)

View File

@@ -4,6 +4,7 @@
 #include <algorithm>
 #include <cassert>
+#include <stdexcept>

 const char * llm_type_name(llm_type type) {
     switch (type) {

View File

@@ -1,5 +1,6 @@
 #include "llama-sampling.h"

+#include "llama-impl.h"
 #include "llama-vocab.h"
 #include "llama-grammar.h"
@@ -14,6 +15,117 @@
 #include <numeric>
 #include <random>
 #include <unordered_map>
+#include <stdexcept>
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    //T & operator[](size_t i) {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    //const T & at(size_t i) const {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+
+    std::vector<T> data;
+};
+
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
     // iterator for the probabilities
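The ring buffer keeps the last `capacity` elements, overwriting the oldest on push_back once full; `rat(i)` indexes backwards from the most recent element, which is what the repetition-penalty samplers need. A quick standalone sketch of those semantics, assuming the struct above:

```cpp
ring_buffer<int> rb(3);
rb.push_back(1);
rb.push_back(2);
rb.push_back(3);
rb.push_back(4);              // full: 1 is overwritten, contents are now {2, 3, 4}

int newest = rb.rat(0);       // 4 ("reverse at": index 0 is the most recent element)
int oldest = rb.front();      // 2
int popped = rb.pop_front();  // 2, size shrinks to 2
```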

View File

@@ -202,13 +202,6 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
 // globals
 //

-struct llama_logger_state {
-    ggml_log_callback log_callback = llama_log_callback_default;
-    void * log_callback_user_data = nullptr;
-};
-
-static llama_logger_state g_logger_state;
-
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -17188,46 +17181,3 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
-
-// For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-) {
-    return ctx->model.tensors_by_name;
-}
-
-void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    ggml_log_set(log_callback, user_data);
-    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_logger_state.log_callback_user_data = user_data;
-}
-
-static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
-    va_list args_copy;
-    va_copy(args_copy, args);
-    char buffer[128];
-    int len = vsnprintf(buffer, 128, format, args);
-    if (len < 128) {
-        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
-    } else {
-        char * buffer2 = new char[len + 1];
-        vsnprintf(buffer2, len + 1, format, args_copy);
-        buffer2[len] = 0;
-        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
-        delete[] buffer2;
-    }
-    va_end(args_copy);
-}
-
-void llama_log_internal(ggml_log_level level, const char * format, ...) {
-    va_list args;
-    va_start(args, format);
-    llama_log_internal_v(level, format, args);
-    va_end(args);
-}
-
-void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}