llama : functions -> methods (#11110)

Georgi Gerganov 2025-01-06 16:13:01 +02:00
parent c3f9d25706
commit 609ec7e0a0
14 changed files with 4625 additions and 4521 deletions

View File

@@ -1,5 +1,6 @@
#include "llama-adapter.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include <algorithm>
@@ -62,7 +63,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
cvec.tensors.reserve(hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
@@ -262,7 +263,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
}
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
const auto * model_tensor = model.get_tensor(name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}
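The two hunks above show the commit's core pattern: a free function that took the model as an explicit argument becomes a const method on llama_model. A minimal standalone sketch of that refactor, using hypothetical stand-in types rather than the real llama.cpp definitions:

#include <string>
#include <unordered_map>

// stand-ins for ggml_tensor / llama_model, for illustration only
struct tensor { std::string name; };

struct model {
    std::unordered_map<std::string, tensor> tensors_by_name;

    // before: const tensor * model_get_tensor(const model & m, const char * name);
    // after:  a const member function, so callers write m.get_tensor(name)
    const tensor * get_tensor(const char * name) const {
        auto it = tensors_by_name.find(name);
        return it == tensors_by_name.end() ? nullptr : &it->second;
    }
};

int main() {
    model m;
    m.tensors_by_name.emplace("tok_embd.weight", tensor{"tok_embd.weight"});
    const tensor * t = m.get_tensor("tok_embd.weight"); // mirrors model.get_tensor(name.c_str()) above
    return t ? 0 : 1;
}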

View File

@@ -1,5 +1,7 @@
#include "llama-context.h"
#include "llama-mmap.h"
#include <cassert>
#include <cmath>
#include <cstring>
@@ -504,7 +506,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
auto * output_dev = lctx.model.dev_output.dev;
auto * output_dev = lctx.model.dev_output();
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
if (output_dev_host_buft) {
buft = output_dev_host_buft;
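This hunk keeps the same buffer-type selection logic and only changes how the output device is obtained (model.dev_output()). A hedged sketch of that selection, written against the ggml-backend calls visible in the diff; the helper name is hypothetical and the surrounding llama_context plumbing is omitted, so it is illustrative rather than buildable on its own:

// illustrative only: assumes `model` exposes dev_output() as in the diff above
static ggml_backend_buffer_type_t pick_output_buft(const llama_model & model) {
    // default: plain CPU buffer for the logits/embeddings
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // prefer the host buffer type of the device that owns the output tensor,
    // for faster device -> system memory transfers
    ggml_backend_dev_t output_dev = model.dev_output();
    ggml_backend_buffer_type_t host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
    if (host_buft) {
        buft = host_buft;
    }
    return buft;
}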

View File

@@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
const std::string & piece = grammar.vocab->token_to_piece(id);
if (llama_token_is_eog_impl(*grammar.vocab, id)) {
if (grammar.vocab->is_eog(id)) {
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
@@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
GGML_ASSERT(grammar.vocab != nullptr);
if (llama_token_is_eog_impl(*grammar.vocab, token)) {
if (grammar.vocab->is_eog(token)) {
for (const auto & stack : grammar.stacks) {
if (stack.empty()) {
return;
@@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
GGML_ABORT("fatal error");
}
const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
const std::string & piece = grammar.vocab->token_to_piece(token);
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
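Both grammar hunks swap direct access to vocab internals (cache_token_to_piece, llama_token_is_eog_impl) for the new llama_vocab methods. A small standalone sketch of the EOG-masking step that llama_grammar_apply_impl performs, with stand-in types in place of the real vocab and candidate array:

#include <cmath>
#include <cstddef>
#include <set>
#include <vector>

using token = int;                                    // stand-in for llama_token

struct vocab {
    std::set<token> eog_ids;                          // tokens that end generation
    bool is_eog(token id) const { return eog_ids.count(id) > 0; }
};

struct candidate { token id; float logit; };

// mask end-of-generation tokens unless the grammar currently allows stopping
static void mask_eog(const vocab & v, std::vector<candidate> & cur, bool allow_eog) {
    for (size_t i = 0; i < cur.size(); ++i) {
        if (v.is_eog(cur[i].id) && !allow_eog) {
            cur[i].logit = -INFINITY;
        }
    }
}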

View File

@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
ggml_backend_buffer_type_t buft;
if (offload) {
auto * dev = model.dev_layer.at(i).dev;
auto * dev = model.dev_layer(i);
buft = ggml_backend_dev_buffer_type(dev);
} else {
buft = ggml_backend_cpu_buffer_type();
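The per-layer device lookup becomes model.dev_layer(i) instead of reaching into model.dev_layer.at(i).dev. A hedged sketch of the surrounding offload decision, using the ggml-backend calls from the hunk; the helper name is hypothetical and the sketch is illustrative, not standalone:

// illustrative only: chooses where the KV cache tensors for layer i should live
static ggml_backend_buffer_type_t kv_buft_for_layer(const llama_model & model, int i, bool offload) {
    if (offload) {
        // keep the KV cache on the same device as the weights of layer i
        ggml_backend_dev_t dev = model.dev_layer(i);
        return ggml_backend_dev_buffer_type(dev);
    }
    // otherwise keep it in regular host memory
    return ggml_backend_cpu_buffer_type();
}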

View File

@@ -35,7 +35,7 @@
// TODO: consider moving to llama-impl.h if needed in more places
#if defined(_WIN32)
std::string llama_format_win_err(DWORD err) {
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
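The only change in this hunk is internal linkage (static) for the Windows error-formatting helper; its body is cut off above. For context, such a wrapper conventionally checks the size returned by FormatMessageA, copies the system-allocated message into a std::string and releases it with LocalFree. A hedged sketch of that shape, not a quote of the actual file:

#if defined(_WIN32)
#include <windows.h>
#include <string>

static std::string format_win_err_sketch(DWORD err) {
    LPSTR buf = nullptr;
    const DWORD size = FormatMessageA(
            FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
            NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR) &buf, 0, NULL);
    if (!size) {
        return "FormatMessageA failed";
    }
    std::string ret(buf, size);   // copy before releasing the system-allocated buffer
    LocalFree(buf);
    return ret;
}
#endif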

View File

@@ -7,6 +7,10 @@
#include <cstring>
#include <future>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;
const char * llama_file_version_name(llama_fver version) {
switch (version) {
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
return "unknown";
}
static std::string llama_model_ftype_name(llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
}
switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
default: return "unknown, may not work";
}
}
namespace GGUFMeta {
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
struct GKV_Base_Type {
@@ -1009,3 +1056,17 @@ bool llama_model_loader::load_all_data(
return true;
}
std::string llama_model_loader::ftype_name() const {
return llama_model_ftype_name(ftype);
}
void llama_model_loader::print_info() const {
LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
if (n_bytes < GiB) {
LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
} else {
LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
}
}
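print_info() picks MiB or GiB for the size line and reports bits per weight as n_bytes*8/n_elements. A standalone sketch of that formatting decision, with example values in place of the loader state:

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t n_elements = 8030261248ull;     // example parameter count (hypothetical)
    const uint64_t n_bytes    = 4920000000ull;     // example file size in bytes (hypothetical)

    const double GiB = 1024.0 * 1024.0 * 1024.0;
    const double bpw = n_bytes * 8.0 / n_elements; // bits per weight

    if (n_bytes < (uint64_t) GiB) {
        std::printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes / 1024.0 / 1024.0, bpw);
    } else {
        std::printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes / GiB, bpw);
    }
    return 0;
}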

View File

@@ -155,4 +155,8 @@ struct llama_model_loader {
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data);
std::string ftype_name() const;
void print_info() const;
};

File diff suppressed because it is too large

View File

@@ -4,79 +4,80 @@
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
struct llama_model_loader;
// available models
// TODO: this enum does not follow the enum naming convention
enum llm_type {
MODEL_UNKNOWN,
MODEL_14M,
MODEL_17M,
MODEL_22M,
MODEL_33M,
MODEL_60M,
MODEL_70M,
MODEL_80M,
MODEL_109M,
MODEL_137M,
MODEL_160M,
MODEL_220M,
MODEL_250M,
MODEL_270M,
MODEL_335M,
MODEL_410M,
MODEL_450M,
MODEL_770M,
MODEL_780M,
MODEL_0_5B,
MODEL_1B,
MODEL_1_3B,
MODEL_1_4B,
MODEL_1_5B,
MODEL_1_6B,
MODEL_2B,
MODEL_2_8B,
MODEL_3B,
MODEL_4B,
MODEL_6B,
MODEL_6_9B,
MODEL_7B,
MODEL_8B,
MODEL_9B,
MODEL_11B,
MODEL_12B,
MODEL_13B,
MODEL_14B,
MODEL_15B,
MODEL_16B,
MODEL_20B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
MODEL_35B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
MODEL_SMALL,
MODEL_MEDIUM,
MODEL_LARGE,
MODEL_XL,
MODEL_A1_7B,
MODEL_A2_7B,
MODEL_8x7B,
MODEL_8x22B,
MODEL_16x12B,
MODEL_16x3_8B,
MODEL_10B_128x3_66B,
MODEL_57B_A14B,
MODEL_27B,
LLM_TYPE_UNKNOWN,
LLM_TYPE_14M,
LLM_TYPE_17M,
LLM_TYPE_22M,
LLM_TYPE_33M,
LLM_TYPE_60M,
LLM_TYPE_70M,
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_160M,
LLM_TYPE_220M,
LLM_TYPE_250M,
LLM_TYPE_270M,
LLM_TYPE_335M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_5B,
LLM_TYPE_1B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
LLM_TYPE_1_6B,
LLM_TYPE_2B,
LLM_TYPE_2_8B,
LLM_TYPE_3B,
LLM_TYPE_4B,
LLM_TYPE_6B,
LLM_TYPE_6_9B,
LLM_TYPE_7B,
LLM_TYPE_8B,
LLM_TYPE_9B,
LLM_TYPE_11B,
LLM_TYPE_12B,
LLM_TYPE_13B,
LLM_TYPE_14B,
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
LLM_TYPE_35B,
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
LLM_TYPE_SMALL,
LLM_TYPE_MEDIUM,
LLM_TYPE_LARGE,
LLM_TYPE_XL,
LLM_TYPE_A1_7B,
LLM_TYPE_A2_7B,
LLM_TYPE_8x7B,
LLM_TYPE_8x22B,
LLM_TYPE_16x12B,
LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B,
LLM_TYPE_27B,
};
struct llama_layer_posnet {
@@ -286,11 +287,9 @@ struct llama_layer {
};
struct llama_model {
llm_type type = MODEL_UNKNOWN;
llm_type type = LLM_TYPE_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
@@ -319,78 +318,55 @@ struct llama_model {
std::vector<llama_layer> layers;
llama_model_params params;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
llama_split_mode split_mode;
int main_gpu;
int n_gpu_layers;
std::vector<std::string> rpc_servers;
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
struct layer_dev {
ggml_backend_dev_t dev;
buft_list_t * buft_list;
};
layer_dev dev_input = {};
layer_dev dev_output = {};
std::vector<layer_dev> dev_layer;
// contexts where the model tensors metadata is stored
std::vector<ggml_context_ptr> ctxs;
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_ptr> bufs;
// model memory mapped files
llama_mmaps mappings;
// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
// total number of parameters in the model
uint64_t n_elements = 0;
explicit llama_model(const struct llama_model_params & params);
~llama_model();
// total size of all the tensors in the model in bytes
size_t n_bytes = 0;
void load_stats (llama_model_loader & ml);
void load_arch (llama_model_loader & ml);
void load_hparams(llama_model_loader & ml);
void load_vocab (llama_model_loader & ml);
bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
std::string arch_name() const;
std::string type_name() const;
std::string desc() const;
size_t size() const;
size_t max_nodes() const;
size_t n_devices() const;
// total number of parameters in the model
uint64_t n_elements() const;
void print_info() const;
ggml_backend_dev_t dev_layer(int il) const;
ggml_backend_dev_t dev_output() const;
ggml_backend_buffer_type_t select_buft(int il) const;
const struct ggml_tensor * get_tensor(const char * name) const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
};
const char * llm_type_name(llm_type type);
std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
// used by llama_adapter_lora
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
size_t llama_model_max_nodes(const llama_model & model);
struct llama_model_loader;
// TODO: become llama_model methods
void llm_load_stats (llama_model_loader & ml, llama_model & model);
void llm_load_arch (llama_model_loader & ml, llama_model & model);
void llm_load_hparams (llama_model_loader & ml, llama_model & model);
void llm_load_vocab (llama_model_loader & ml, llama_model & model);
void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
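The header change above is the core of the commit: loader state moves behind a private impl (pimpl) and the former llm_load_* / llama_model_* free functions become llama_model members. A compact standalone sketch of that pimpl layout, with hypothetical fields standing in for the real loader state:

#include <memory>
#include <vector>

// header side: only the interface and an opaque impl pointer are visible
struct model {
    explicit model(int n_layer);
    ~model();

    void   load_stats();        // was: llm_load_stats(ml, model)
    size_t n_bytes() const;     // was a public field, now computed behind the impl

private:
    struct impl;                // defined in the .cpp, holds mappings, buffers, timings, ...
    std::unique_ptr<impl> pimpl;
};

// source side: the definition of impl and the member bodies stay out of the header
struct model::impl {
    std::vector<size_t> layer_sizes;
    size_t total_bytes = 0;
};

model::model(int n_layer) : pimpl(std::make_unique<impl>()) { pimpl->layer_sizes.resize(n_layer, 0); }
model::~model() = default;

void   model::load_stats()       { for (size_t s : pimpl->layer_sizes) pimpl->total_bytes += s; }
size_t model::n_bytes()    const { return pimpl->total_bytes; }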

View File

@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
if (qs.model.type == MODEL_70B) {
if (qs.model.type == LLM_TYPE_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -525,18 +525,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
ml.init_mappings(false); // no prefetching
llama_model model;
llm_load_arch (ml, model);
llm_load_hparams(ml, model);
llm_load_stats (ml, model);
llama_model model(llama_model_default_params());
model.load_arch (ml);
model.load_hparams(ml);
model.load_stats (ml);
struct quantize_state_impl qs(model, params);
if (params->only_copy) {
ftype = model.ftype;
ftype = ml.ftype;
}
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
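Quantization now constructs the model from llama_model_default_params() and drives the load steps through the new member functions, and with only_copy the file type comes from the loader (ml.ftype) instead of the model. A hedged sketch of that setup sequence, using the members declared in llama-model.h above; it is illustrative and assumes the quantize function's surrounding locals (fname_inp, use_mmap, kv_overrides, params):

// illustrative only: mirrors the new setup in llama_model_quantize_impl
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
ml.init_mappings(false);                    // no prefetching during quantization

llama_model model(llama_model_default_params());
model.load_arch   (ml);                     // architecture from GGUF metadata
model.load_hparams(ml);                     // hyperparameters
model.load_stats  (ml);                     // n_elements / n_bytes, used for size reporting

llama_ftype ftype = params->ftype;          // assumed local; requested target type
if (params->only_copy) {
    ftype = ml.ftype;                       // keep the input file's type untouched
}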

View File

@@ -1663,8 +1663,8 @@ struct llama_sampler_dry {
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
std::string word = llama_detokenize(vocab, {token_id}, true);
for (llama_token token_id = 0; token_id < (llama_token) vocab.n_vocab(); token_id++) {
std::string word = vocab.detokenize({token_id}, true);
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
@@ -1681,7 +1681,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
}
}
if (match) {
std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
tokenization.resize(max_tail_len);
}
@@ -2153,7 +2153,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
@@ -2175,7 +2175,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
if (ctx->vocab->is_eog(cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
@@ -2203,17 +2203,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
continue;
}
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(len0);
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(len1);
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
@@ -2248,7 +2248,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
@@ -2269,7 +2269,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
cur_p->data[0].id = ctx->vocab->token_eot();
cur_p->data[0].logit = 1.0f;
return;
@@ -2291,7 +2291,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
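The infill sampler hunks route every vocab query (is_eog, token_to_piece, token_eot, n_vocab) through the new member API without changing the sampling logic. A small standalone sketch of the probability split those hunks compute, end-of-generation mass vs. text mass over the candidates, with stand-in types in place of llama_token_data_array:

#include <cstddef>
#include <set>
#include <vector>

using token = int;                                    // stand-in for llama_token

struct vocab {
    std::set<token> eog_ids;
    bool is_eog(token id) const { return eog_ids.count(id) > 0; }
};

struct candidate { token id; float p; };

// split the candidates' probability mass into end-of-generation vs. text tokens
static void split_eog_mass(const vocab & v, const std::vector<candidate> & cur,
                           float & p_eog_sum, float & p_txt_sum) {
    p_eog_sum = 0.0f;
    p_txt_sum = 0.0f;
    for (size_t i = 0; i < cur.size(); ++i) {
        if (v.is_eog(cur[i].id)) {
            p_eog_sum += cur[i].p;
        } else {
            p_txt_sum += cur[i].p;
        }
    }
}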

File diff suppressed because it is too large

View File

@@ -4,74 +4,161 @@
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
#include <memory>
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
}
}
struct llm_tokenizer;
struct LLM_KV;
struct llama_model_loader;
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
tattr attr;
std::string text;
float score;
llama_token_attr attr;
};
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
llama_vocab();
~llama_vocab();
void load(llama_model_loader & ml, const LLM_KV & kv);
enum llama_vocab_type get_type() const;
enum llama_vocab_pre_type get_pre_type() const;
// TODO: how to deduplicate with llama_hparams.n_vocab ?
uint32_t n_vocab() const;
std::string type_name() const;
bool is_normal (llama_token id) const;
bool is_unknown (llama_token id) const;
bool is_control (llama_token id) const;
bool is_byte (llama_token id) const;
bool is_user_defined(llama_token id) const;
bool is_unused (llama_token id) const;
bool is_eog (llama_token id) const;
uint8_t token_to_byte(llama_token id) const;
llama_token byte_to_token(uint8_t ch) const;
llama_token text_to_token(const std::string & text) const;
const token_data & get_token_data(llama_token id) const;
const char * token_get_text (llama_token id) const;
float token_get_score(llama_token id) const;
llama_token_attr token_get_attr (llama_token id) const;
llama_token token_bos() const;
llama_token token_eos() const;
llama_token token_eot() const;
llama_token token_eom() const;
llama_token token_unk() const;
llama_token token_cls() const;
llama_token token_sep() const;
llama_token token_nl () const;
llama_token token_pad() const;
llama_token token_prefix() const;
llama_token token_middle() const;
llama_token token_suffix() const;
llama_token token_fim_pre() const;
llama_token token_fim_suf() const;
llama_token token_fim_mid() const;
llama_token token_fim_pad() const;
llama_token token_fim_rep() const;
llama_token token_fim_sep() const;
bool add_space_prefix () const;
bool add_bos_token () const;
bool add_eos_token () const;
bool ignore_merges () const;
bool clean_spaces () const;
bool remove_extra_whitespaces () const;
bool escape_whitespaces () const;
bool treat_whitespace_as_suffix() const;
int max_token_text_len() const;
void print_info() const;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
std::vector<llama_token> tokenize(
std::string raw_text,
bool add_special,
bool parse_special = false) const;
int32_t tokenize(
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) const;
// does not write null-terminator to buf
int32_t token_to_piece(
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special) const;
// use cached data
const std::string & token_to_piece(llama_token token) const;
// check if token0 is contained as a prefix in token1
bool token_is_prefix(
llama_token token0,
llama_token token1) const;
int32_t detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const;
std::string detokenize(
const std::vector<llama_token> & tokens,
bool special) const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
std::string token_to_piece_for_cache(
llama_token token,
bool special) const;
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL;
llama_token special_bos_id = 1;
llama_token special_eos_id = 2;
llama_token special_eot_id = LLAMA_TOKEN_NULL;
llama_token special_eom_id = LLAMA_TOKEN_NULL;
llama_token special_unk_id = 0;
llama_token special_sep_id = LLAMA_TOKEN_NULL;
llama_token special_pad_id = LLAMA_TOKEN_NULL;
llama_token special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
llama_token special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13;
llama_token linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// tokenizer flags
bool tokenizer_add_space_prefix = false;
@@ -82,101 +169,4 @@ struct llama_vocab {
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llm_tokenizer * tokenizer = nullptr;
llama_vocab() = default;
~llama_vocab();
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
};
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
std::string llama_detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens,
bool special);
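All of the removed *_impl free functions above now live on llama_vocab itself. A hedged usage sketch of the convenience overloads declared in the new header (internal llama.cpp API, shown for illustration only; it assumes an already-loaded vocab):

#include "llama-vocab.h"

// illustrative only: round-trip a string through the new member API
static std::string round_trip(const llama_vocab & vocab, const std::string & text) {
    // std::vector overload declared above; parse_special defaults to false
    std::vector<llama_token> toks = vocab.tokenize(text, /*add_special=*/true);

    // cached single-token pieces are available via the token_to_piece(token) overload
    if (!toks.empty()) {
        const std::string & first_piece = vocab.token_to_piece(toks.front());
        (void) first_piece;
    }

    // the bool mirrors the `special` flag of the detokenize declaration above
    return vocab.detokenize(toks, /*special=*/false);
}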

File diff suppressed because it is too large