llama : impl

ggml-ci
Georgi Gerganov 2024-12-23 17:32:31 +02:00
parent b0d6b66b7d
commit a7df0714db
16 changed files with 230 additions and 209 deletions

View File

@@ -638,6 +638,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
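Both spellings give the constants internal linkage; the anonymous namespace is the C++ idiom and avoids repeating `static` on every declaration. A minimal illustration of the equivalence (the names here are generic, not from this commit):

```cpp
// equivalent internal-linkage declarations: neither symbol is visible to other TUs
static const char * const KV_OLD_STYLE = "old";   // per-declaration 'static'

namespace {
const char * const KV_NEW_STYLE = "new";          // one block scopes many declarations
}
```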

View File

@@ -2,15 +2,14 @@
 #include "common.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <stdio.h>
-#include <string.h>
 #include <climits>
+#include <cstdio>
+#include <cstring>
 #include <stdexcept>

 #if defined(_WIN32)

View File

@@ -1,7 +1,7 @@
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "llama-impl.h"
+#include "llama-context.h"
+#include "common.h"

 #include <algorithm>
 #include <cassert>
@@ -9,11 +9,9 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <numeric>
 #include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -330,13 +328,13 @@ int main(int argc, char ** argv) {
         }
     }

-    const auto &tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(ctx);

     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors) {
+    for (const auto & kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -371,8 +369,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        const auto * qfns = ggml_get_type_traits(type);
-        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
@@ -382,7 +380,7 @@ int main(int argc, char ** argv) {
             error_stats global_stats {};

-            for (const auto& kv_tensor : tensors) {
+            for (const auto & kv_tensor : tensors) {
                 if (!layer_included(params, kv_tensor.first)) {
                     continue;
                 }

View File

@@ -15,6 +15,7 @@ add_library(llama
             llama-chat.cpp
             llama-context.cpp
             llama-hparams.cpp
+            llama-impl.cpp
             llama-grammar.cpp
             llama-kv-cache.cpp
             llama-mmap.cpp

View File

@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <map>
 #include <cassert>
+#include <stdexcept>

 // vec

View File

@@ -26,7 +26,9 @@ struct llama_ubatch {
 struct llama_sbatch_seq {
     int32_t n_seq_id;
+
     llama_seq_id * seq_id;
+
     size_t offset;
     size_t length;
 };
@@ -112,8 +114,8 @@ struct llama_sbatch {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 memcpy(
-                    ubatch.embd + n_embd * (ubatch.n_tokens + i),
-                    batch->embd + n_embd * ids[seq.offset + i],
+                    ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
+                    batch->embd + (n_embd * ids[seq.offset + i]),
                     n_embd * sizeof(float)
                 );
             }
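The copy is byte-identical before and after; the added parentheses only group the element offset before it is added to the base pointer. A hedged restatement of the addressing for intuition (row-major embeddings, one `n_embd`-float row per token):

```cpp
// token rows are contiguous: token t starts at float offset n_embd * t
float       * dst = ubatch.embd + (n_embd * (ubatch.n_tokens + i));  // next free row in the ubatch
const float * src = batch->embd + (n_embd * ids[seq.offset + i]);    // source row, after permutation by ids
memcpy(dst, src, n_embd * sizeof(float));                            // copy one embedding row
```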

View File

@@ -1,5 +1,7 @@
 #include "llama-context.h"

+#include <stdexcept>
+
 // deprecated
 size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);
@@ -968,3 +970,8 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
         }
     }
 }
+
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
+    return ctx->model.tensors_by_name;
+}
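The map pairs each tensor's GGUF name with its `ggml_tensor`. A sketch of how an internal test might walk it (the printing is illustrative, not part of this commit):

```cpp
// illustrative: dump the name -> tensor map exposed for internal tests
const auto & tensors = llama_internal_get_tensor_map(ctx);
for (const auto & kv : tensors) {
    const struct ggml_tensor * t = kv.second;
    printf("%-48s %10lld elements, type %s\n",
            kv.first.c_str(), (long long) ggml_nelements(t), ggml_type_name(t->type));
}
```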

View File

@@ -219,3 +219,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
         out_ids.clear();
     }
 }
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);

View File

@@ -1,5 +1,6 @@
 #include "llama-grammar.h"

+#include "llama-impl.h"
 #include "llama-vocab.h"
 #include "llama-sampling.h"

View File

@@ -1,8 +1,10 @@
 #pragma once

-#include "llama-impl.h"
+#include "llama.h"

 #include <map>
+#include <string>
+#include <vector>

 struct llama_vocab;

src/llama-impl.cpp (new file, 74 lines)
View File

@@ -0,0 +1,74 @@
+#include "llama-impl.h"
+
+#include "llama.h"
+
+#include <cstdarg>
+
+struct llama_logger_state {
+    ggml_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+
+static llama_logger_state g_logger_state;
+
+time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+time_meas::~time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
+
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = new char[len + 1];
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+void llama_log_internal(ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
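Note that llama_log_internal_v formats into a 128-byte stack buffer first and only heap-allocates when vsnprintf reports that the message did not fit, so the common short-message path does no allocation. A minimal sketch of installing a custom sink through the public llama_log_set shown above (the callback and string sink are illustrative, not part of this commit):

```cpp
#include "llama.h"

#include <string>

// illustrative callback: append every log message to a caller-owned string
static void my_log_cb(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    static_cast<std::string *>(user_data)->append(text);
}

int main() {
    std::string log_sink;
    llama_log_set(my_log_cb, &log_sink); // passing nullptr restores the default stderr sink
}
```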

View File

@@ -1,10 +1,8 @@
 #pragma once

-#include "llama.h"
+#include "ggml.h"

 #include <string>
-#include <vector>
-#include <stdexcept>

 #ifdef __GNUC__
 #ifdef __MINGW32__
@@ -40,146 +38,12 @@ std::string format(const char * fmt, ...);
 //

 struct time_meas {
-    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
-
-    ~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
-    }
+    time_meas(int64_t & t_acc, bool disable = false);
+    ~time_meas();

     const int64_t t_start_us;
     int64_t & t_acc;
 };

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    if (search.empty()) {
-        return;
-    }
-    std::string builder;
-    builder.reserve(s.length());
-    size_t pos = 0;
-    size_t last_pos = 0;
-    while ((pos = s.find(search, last_pos)) != std::string::npos) {
-        builder.append(s, last_pos, pos - last_pos);
-        builder.append(replace);
-        last_pos = pos + search.length();
-    }
-    builder.append(s, last_pos, std::string::npos);
-    s = std::move(builder);
-}
-
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
-
-// the ring buffer works similarly to std::deque, but with a fixed capacity
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
-
-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (capacity == 0) {
-            throw std::runtime_error("ring buffer: capacity is zero");
-        }
-
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    //T & operator[](size_t i) {
-    //    if (i >= sz) {
-    //        throw std::runtime_error("ring buffer: index out of bounds");
-    //    }
-    //    return data[(first + i) % capacity];
-    //}
-
-    //const T & at(size_t i) const {
-    //    if (i >= sz) {
-    //        throw std::runtime_error("ring buffer: index out of bounds");
-    //    }
-    //    return data[(first + i) % capacity];
-    //}
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-
-    std::vector<T> data;
-};
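time_meas, now declared here and defined in llama-impl.cpp, is a small RAII timer: construction records ggml_time_us(), destruction adds the elapsed microseconds to the referenced accumulator, and disable = true turns it into a no-op. A minimal usage sketch (the counter and function names are illustrative):

```cpp
int64_t t_decode_us = 0; // accumulates elapsed time across calls

void decode_step(bool timing_disabled) {
    time_meas tm(t_decode_us, timing_disabled); // starts the clock (or not)
    // ... timed work ...
} // ~time_meas() adds the elapsed microseconds to t_decode_us
```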

View File

@@ -6,6 +6,7 @@
 #include <cstring>
 #include <climits>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)

View File

@@ -4,6 +4,7 @@
 #include <algorithm>
 #include <cassert>
+#include <stdexcept>

 const char * llm_type_name(llm_type type) {
     switch (type) {

View File

@@ -1,5 +1,6 @@
 #include "llama-sampling.h"

+#include "llama-impl.h"
 #include "llama-vocab.h"
 #include "llama-grammar.h"
@@ -14,6 +15,117 @@
 #include <numeric>
 #include <random>
 #include <unordered_map>
+#include <stdexcept>
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    //T & operator[](size_t i) {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    //const T & at(size_t i) const {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+
+    std::vector<T> data;
+};
+
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
     // iterator for the probabilities
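The ring buffer keeps the last `capacity` elements, overwriting the oldest on push_back once full; `rat(i)` indexes backwards from the most recent element, which is what the repetition-penalty samplers need. A quick standalone sketch of those semantics, assuming the struct above:

```cpp
ring_buffer<int> rb(3);
rb.push_back(1);
rb.push_back(2);
rb.push_back(3);
rb.push_back(4);              // full: 1 is overwritten, contents are now {2, 3, 4}

int newest = rb.rat(0);       // 4 ("reverse at": index 0 is the most recent element)
int oldest = rb.front();      // 2
int popped = rb.pop_front();  // 2, size shrinks to 2
```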

View File

@@ -202,13 +202,6 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
 // globals
 //

-struct llama_logger_state {
-    ggml_log_callback log_callback = llama_log_callback_default;
-    void * log_callback_user_data = nullptr;
-};
-
-static llama_logger_state g_logger_state;
-
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -17188,46 +17181,3 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
-
-// For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-) {
-    return ctx->model.tensors_by_name;
-}
-
-void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    ggml_log_set(log_callback, user_data);
-    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_logger_state.log_callback_user_data = user_data;
-}
-
-static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
-    va_list args_copy;
-    va_copy(args_copy, args);
-    char buffer[128];
-    int len = vsnprintf(buffer, 128, format, args);
-    if (len < 128) {
-        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
-    } else {
-        char * buffer2 = new char[len + 1];
-        vsnprintf(buffer2, len + 1, format, args_copy);
-        buffer2[len] = 0;
-        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
-        delete[] buffer2;
-    }
-    va_end(args_copy);
-}
-
-void llama_log_internal(ggml_log_level level, const char * format, ...) {
-    va_list args;
-    va_start(args, format);
-    llama_log_internal_v(level, format, args);
-    va_end(args);
-}
-
-void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}