Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-30 05:43:03 +01:00)
llama : functions -> methods (#11110)

commit 609ec7e0a0
parent c3f9d25706
@@ -1,5 +1,6 @@
 #include "llama-adapter.h"
 
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -62,7 +63,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
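This hunk is representative of the whole commit: a free helper that took the model as an explicit argument becomes a const method on the model object. Below is a minimal, self-contained sketch of that calling-convention change, using hypothetical stand-in types rather than the real llama.cpp declarations:

```cpp
#include <cstdio>

struct buft_sketch { const char * name; };   // stand-in for ggml_backend_buffer_type_t

struct model_sketch {
    buft_sketch cpu{"cpu"}, gpu{"gpu"};
    int n_gpu_layers = 16;

    // new style: the model picks the buffer type for layer `il` itself
    const buft_sketch & select_buft(int il) const {
        return il < n_gpu_layers ? gpu : cpu;
    }
};

// old style, kept only for comparison: a free function taking the model explicitly
static const buft_sketch & model_select_buft(const model_sketch & m, int il) {
    return m.select_buft(il);
}

int main() {
    model_sketch m;
    std::printf("layer 3 -> %s, layer 40 -> %s\n",
            m.select_buft(3).name, model_select_buft(m, 40).name);
}
```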
@@ -262,7 +263,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         }
 
         // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
         if (!model_tensor) {
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
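The LoRA path gets the same treatment, and the lookup also becomes const-correct since the adapter only reads the base tensor. A small, hypothetical sketch of a name-based, read-only tensor lookup of this shape:

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>

struct tensor_sketch { int n_dims; };

struct model_sketch {
    std::unordered_map<std::string, tensor_sketch> tensors;

    // read-only lookup by name; returns nullptr when the tensor does not exist
    const tensor_sketch * get_tensor(const char * name) const {
        auto it = tensors.find(name);
        return it == tensors.end() ? nullptr : &it->second;
    }
};

int main() {
    model_sketch m;
    m.tensors["blk.0.attn_q.weight"] = {2};
    const auto * t = m.get_tensor("blk.0.attn_q.weight");
    std::printf("found: %s\n", t ? "yes" : "no (wrong base model?)");
}
```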
@@ -1,5 +1,7 @@
 #include "llama-context.h"
 
+#include "llama-mmap.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -504,7 +506,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 
     auto * buft = ggml_backend_cpu_buffer_type();
     // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-    auto * output_dev = lctx.model.dev_output.dev;
+    auto * output_dev = lctx.model.dev_output();
     auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
     if (output_dev_host_buft) {
         buft = output_dev_host_buft;
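The surrounding logic is unchanged: prefer the host buffer type of the output device and fall back to the CPU buffer type when the device does not provide one. A stand-alone sketch of that fallback pattern, with made-up names:

```cpp
#include <cstdio>

struct buft_sketch { const char * name; };

static buft_sketch cpu_buft  {"cpu"};
static buft_sketch host_buft {"cuda_host"};

// stand-in for the device host-buffer query: may return nullptr
static const buft_sketch * dev_host_buffer_type(bool has_pinned_memory) {
    return has_pinned_memory ? &host_buft : nullptr;
}

int main() {
    const buft_sketch * buft = &cpu_buft;
    if (const buft_sketch * host = dev_host_buffer_type(true)) {
        buft = host;    // faster transfers to system memory
    }
    std::printf("output buffer type: %s\n", buft->name);
}
```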
@@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         const llama_token id = cur_p->data[i].id;
-        const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
+        const std::string & piece = grammar.vocab->token_to_piece(id);
 
-        if (llama_token_is_eog_impl(*grammar.vocab, id)) {
+        if (grammar.vocab->is_eog(id)) {
             if (!allow_eog) {
                 cur_p->data[i].logit = -INFINITY;
             }
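The grammar code now goes through the vocab's own accessors: the cached piece lookup is a method that returns a const reference, and the end-of-generation check is a member predicate. Both appear again throughout the sampler hunks below.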
@@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
-    if (llama_token_is_eog_impl(*grammar.vocab, token)) {
+    if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
@@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
+    const std::string & piece = grammar.vocab->token_to_piece(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);
@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_type_t buft;
         if (offload) {
-            auto * dev = model.dev_layer.at(i).dev;
+            auto * dev = model.dev_layer(i);
             buft = ggml_backend_dev_buffer_type(dev);
         } else {
            buft = ggml_backend_cpu_buffer_type();
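Here the public vector of per-layer device records is replaced by an accessor that returns the device directly. A self-contained sketch of hiding such a per-layer table behind a method (stand-in types):

```cpp
#include <cstdio>
#include <vector>

struct dev_sketch { const char * name; };

struct model_sketch {
    // new style: the per-layer device table stays an implementation detail
    dev_sketch dev_layer(int il) const { return layers.at(il); }

    std::vector<dev_sketch> layers;
};

int main() {
    model_sketch m;
    m.layers = {{"CUDA0"}, {"CUDA0"}, {"CPU"}};
    std::printf("layer 2 runs on %s\n", m.dev_layer(2).name);
}
```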
@@ -35,7 +35,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }
 
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
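The new ftype helper handles the LLAMA_FTYPE_GUESSED marker by masking the bit off and recursing once before naming the underlying type. The same flag-plus-recursion pattern in a tiny, self-contained sketch with hypothetical enum values:

```cpp
#include <cstdio>
#include <string>

// hypothetical stand-ins: a small name table plus a "guessed" marker bit
enum sketch_ftype { SKETCH_FTYPE_F16 = 1, SKETCH_FTYPE_Q4_0 = 2, SKETCH_FTYPE_GUESSED = 1024 };

static std::string sketch_ftype_name(int ftype) {
    if (ftype & SKETCH_FTYPE_GUESSED) {
        // strip the marker bit, name the underlying type, then tag it
        return sketch_ftype_name(ftype & ~SKETCH_FTYPE_GUESSED) + " (guessed)";
    }
    switch (ftype) {
        case SKETCH_FTYPE_F16:  return "F16";
        case SKETCH_FTYPE_Q4_0: return "Q4_0";
        default:                return "unknown, may not work";
    }
}

int main() {
    std::printf("%s\n", sketch_ftype_name(SKETCH_FTYPE_Q4_0 | SKETCH_FTYPE_GUESSED).c_str());
}
```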
@@ -1009,3 +1056,17 @@ bool llama_model_loader::load_all_data(
 
     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
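print_info reports the file size and bits per weight, where BPW is simply total bytes times 8 divided by the total element count. The same arithmetic as a stand-alone snippet with hypothetical totals:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical totals; the real values come from the GGUF metadata
    uint64_t n_bytes    = 4368439296ull;  // ~4.07 GiB
    uint64_t n_elements = 8030261248ull;  // ~8.0 B parameters

    if (n_bytes < 1024ull*1024*1024) {
        std::printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
    } else {
        std::printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
    }
}
```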
@@ -155,4 +155,8 @@ struct llama_model_loader {
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
src/llama-model.cpp: 3995 lines changed. File diff suppressed because it is too large.
@@ -4,79 +4,80 @@
 #include "llama-arch.h"
 #include "llama-hparams.h"
 #include "llama-vocab.h"
-#include "llama-mmap.h"
 
-#include "ggml-cpp.h"
-
+#include <memory>
+#include <string>
+#include <unordered_map>
 #include <vector>
 
+struct llama_model_loader;
+
 // available models
-// TODO: this enum does not follow the enum naming convention
 enum llm_type {
-    MODEL_UNKNOWN,
-    MODEL_14M,
-    MODEL_17M,
-    MODEL_22M,
-    MODEL_33M,
-    MODEL_60M,
-    MODEL_70M,
-    MODEL_80M,
-    MODEL_109M,
-    MODEL_137M,
-    MODEL_160M,
-    MODEL_220M,
-    MODEL_250M,
-    MODEL_270M,
-    MODEL_335M,
-    MODEL_410M,
-    MODEL_450M,
-    MODEL_770M,
-    MODEL_780M,
-    MODEL_0_5B,
-    MODEL_1B,
-    MODEL_1_3B,
-    MODEL_1_4B,
-    MODEL_1_5B,
-    MODEL_1_6B,
-    MODEL_2B,
-    MODEL_2_8B,
-    MODEL_3B,
-    MODEL_4B,
-    MODEL_6B,
-    MODEL_6_9B,
-    MODEL_7B,
-    MODEL_8B,
-    MODEL_9B,
-    MODEL_11B,
-    MODEL_12B,
-    MODEL_13B,
-    MODEL_14B,
-    MODEL_15B,
-    MODEL_16B,
-    MODEL_20B,
-    MODEL_30B,
-    MODEL_32B,
-    MODEL_34B,
-    MODEL_35B,
-    MODEL_40B,
-    MODEL_65B,
-    MODEL_70B,
-    MODEL_236B,
-    MODEL_314B,
-    MODEL_671B,
-    MODEL_SMALL,
-    MODEL_MEDIUM,
-    MODEL_LARGE,
-    MODEL_XL,
-    MODEL_A1_7B,
-    MODEL_A2_7B,
-    MODEL_8x7B,
-    MODEL_8x22B,
-    MODEL_16x12B,
-    MODEL_16x3_8B,
-    MODEL_10B_128x3_66B,
-    MODEL_57B_A14B,
-    MODEL_27B,
+    LLM_TYPE_UNKNOWN,
+    LLM_TYPE_14M,
+    LLM_TYPE_17M,
+    LLM_TYPE_22M,
+    LLM_TYPE_33M,
+    LLM_TYPE_60M,
+    LLM_TYPE_70M,
+    LLM_TYPE_80M,
+    LLM_TYPE_109M,
+    LLM_TYPE_137M,
+    LLM_TYPE_160M,
+    LLM_TYPE_220M,
+    LLM_TYPE_250M,
+    LLM_TYPE_270M,
+    LLM_TYPE_335M,
+    LLM_TYPE_410M,
+    LLM_TYPE_450M,
+    LLM_TYPE_770M,
+    LLM_TYPE_780M,
+    LLM_TYPE_0_5B,
+    LLM_TYPE_1B,
+    LLM_TYPE_1_3B,
+    LLM_TYPE_1_4B,
+    LLM_TYPE_1_5B,
+    LLM_TYPE_1_6B,
+    LLM_TYPE_2B,
+    LLM_TYPE_2_8B,
+    LLM_TYPE_3B,
+    LLM_TYPE_4B,
+    LLM_TYPE_6B,
+    LLM_TYPE_6_9B,
+    LLM_TYPE_7B,
+    LLM_TYPE_8B,
+    LLM_TYPE_9B,
+    LLM_TYPE_11B,
+    LLM_TYPE_12B,
+    LLM_TYPE_13B,
+    LLM_TYPE_14B,
+    LLM_TYPE_15B,
+    LLM_TYPE_16B,
+    LLM_TYPE_20B,
+    LLM_TYPE_30B,
+    LLM_TYPE_32B,
+    LLM_TYPE_34B,
+    LLM_TYPE_35B,
+    LLM_TYPE_40B,
+    LLM_TYPE_65B,
+    LLM_TYPE_70B,
+    LLM_TYPE_236B,
+    LLM_TYPE_314B,
+    LLM_TYPE_671B,
+    LLM_TYPE_SMALL,
+    LLM_TYPE_MEDIUM,
+    LLM_TYPE_LARGE,
+    LLM_TYPE_XL,
+    LLM_TYPE_A1_7B,
+    LLM_TYPE_A2_7B,
+    LLM_TYPE_8x7B,
+    LLM_TYPE_8x22B,
+    LLM_TYPE_16x12B,
+    LLM_TYPE_16x3_8B,
+    LLM_TYPE_10B_128x3_66B,
+    LLM_TYPE_57B_A14B,
+    LLM_TYPE_27B,
 };
 
 struct llama_layer_posnet {
@@ -286,11 +287,9 @@ struct llama_layer {
 };
 
 struct llama_model {
-    llm_type type = MODEL_UNKNOWN;
+    llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;
 
-    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
     std::string name = "n/a";
 
     llama_hparams hparams = {};
@@ -319,78 +318,55 @@ struct llama_model {
 
     std::vector<llama_layer> layers;
 
+    llama_model_params params;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
-    llama_split_mode split_mode;
-    int main_gpu;
-    int n_gpu_layers;
-
     std::vector<std::string> rpc_servers;
 
     // list of devices used in this model
     std::vector<ggml_backend_dev_t> devices;
 
-
-    // lists of buffer types used for each layer
-    using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-    buft_list_t cpu_buft_list;
-    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
-
-    struct layer_dev {
-        ggml_backend_dev_t dev;
-        buft_list_t * buft_list;
-    };
-
-    layer_dev dev_input = {};
-    layer_dev dev_output = {};
-    std::vector<layer_dev> dev_layer;
-
-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr> ctxs;
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    // model memory mapped files
-    llama_mmaps mappings;
-
-    // objects representing data potentially being locked in memory
-    llama_mlocks mlock_bufs;
-    llama_mlocks mlock_mmaps;
-
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
-    // total number of parameters in the model
-    uint64_t n_elements = 0;
+    explicit llama_model(const struct llama_model_params & params);
+    ~llama_model();
 
-    // total size of all the tensors in the model in bytes
-    size_t n_bytes = 0;
+    void load_stats (llama_model_loader & ml);
+    void load_arch (llama_model_loader & ml);
+    void load_hparams(llama_model_loader & ml);
+    void load_vocab (llama_model_loader & ml);
+    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+
+    std::string arch_name() const;
+    std::string type_name() const;
+
+    std::string desc() const;
+
+    size_t size() const;
+    size_t max_nodes() const;
+    size_t n_devices() const;
+
+    // total number of parameters in the model
+    uint64_t n_elements() const;
+
+    void print_info() const;
+
+    ggml_backend_dev_t dev_layer(int il) const;
+    ggml_backend_dev_t dev_output() const;
+
+    ggml_backend_buffer_type_t select_buft(int il) const;
+
+    const struct ggml_tensor * get_tensor(const char * name) const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };
 
 const char * llm_type_name(llm_type type);
 
-std::string llama_model_arch_name (const llama_model & model);
-std::string llama_model_type_name (const llama_model & model);
-std::string llama_model_ftype_name(const llama_model & model);
-
-// used by llama_adapter_cvec
-ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
-
-// used by llama_adapter_lora
-struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
-
-size_t llama_model_max_nodes(const llama_model & model);
-
-struct llama_model_loader;
-
-// TODO: become llama_model methods
-void llm_load_stats (llama_model_loader & ml, llama_model & model);
-void llm_load_arch (llama_model_loader & ml, llama_model & model);
-void llm_load_hparams (llama_model_loader & ml, llama_model & model);
-void llm_load_vocab (llama_model_loader & ml, llama_model & model);
-void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
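Most of the data members move behind a pimpl: the header keeps only a forward declaration of impl plus a std::unique_ptr to it, which is why the struct now needs an out-of-line constructor and destructor. A minimal single-file sketch of the idiom, with hypothetical members:

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// --- what would live in the header ---
struct model_sketch {
    explicit model_sketch(int n_layer);
    ~model_sketch();               // must be defined out of line, where impl is complete

    int n_layers() const;

private:
    struct impl;                   // forward declaration only; details stay in the .cpp
    std::unique_ptr<impl> pimpl;
};

// --- what would live in the .cpp ---
struct model_sketch::impl {
    std::vector<int> layer_dev;    // hidden per-layer data
};

model_sketch::model_sketch(int n_layer) : pimpl(std::make_unique<impl>()) {
    pimpl->layer_dev.assign(n_layer, 0);
}

model_sketch::~model_sketch() = default;

int model_sketch::n_layers() const {
    return (int) pimpl->layer_dev.size();
}

int main() {
    model_sketch m(32);
    std::printf("n_layers = %d\n", m.n_layers());
}
```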
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
             use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-    if (qs.model.type == MODEL_70B) {
+    if (qs.model.type == LLM_TYPE_70B) {
         // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
         // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
         // nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -525,18 +525,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
     llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
-    llama_model model;
-    llm_load_arch (ml, model);
-    llm_load_hparams(ml, model);
-    llm_load_stats (ml, model);
+    llama_model model(llama_model_default_params());
+
+    model.load_arch (ml);
+    model.load_hparams(ml);
+    model.load_stats (ml);
 
     struct quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
-        ftype = model.ftype;
+        ftype = ml.ftype;
     }
     const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
     if (params->imatrix) {
@@ -1663,8 +1663,8 @@ struct llama_sampler_dry {
 
 // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
 static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
-    for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
-        std::string word = llama_detokenize(vocab, {token_id}, true);
+    for (llama_token token_id = 0; token_id < (llama_token) vocab.n_vocab(); token_id++) {
+        std::string word = vocab.detokenize({token_id}, true);
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
@@ -1681,7 +1681,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
             }
         }
         if (match) {
-            std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
+            std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
             if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
                 tokenization.resize(max_tail_len);
             }
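On the vocabulary side, free helpers and public fields become methods: n_vocab is now a call, and detokenize returns a std::string directly. A toy, self-contained sketch of that accessor-plus-convenience-overload shape (not the real llama_vocab):

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct vocab_sketch {
    // new style: the size is an accessor instead of a public field
    uint32_t n_vocab() const { return (uint32_t) pieces.size(); }

    // convenience overload that returns a std::string, mirroring vocab.detokenize(...)
    std::string detokenize(const std::vector<int> & tokens, bool /*special*/) const {
        std::string out;
        for (int t : tokens) {
            out += pieces.at(t);
        }
        return out;
    }

    std::vector<std::string> pieces;
};

int main() {
    vocab_sketch v;
    v.pieces = {"Hello", ",", " world"};
    std::printf("n_vocab = %u, text = \"%s\"\n", v.n_vocab(), v.detokenize({0, 1, 2}, true).c_str());
}
```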
@@ -2153,7 +2153,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     float p_eog_sum = 0.0f;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+        if (ctx->vocab->is_eog(cur_p->data[i].id)) {
             p_eog_sum += cur_p->data[i].p;
         } else {
             p_txt_sum += cur_p->data[i].p;
@@ -2175,7 +2175,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         float p_sum = 0.0f;
 
         for (size_t i = 0; i < size_org; ++i) {
-            if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+            if (ctx->vocab->is_eog(cur_p->data[i].id)) {
                 p_sum += cur_p->data[i].p;
 
                 cur_p->data[cur_p->size++] = cur_p->data[i];
@@ -2203,17 +2203,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
             continue;
         }
 
-        int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+        int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
         if (len0 < 0) {
             ctx->buf0.resize(len0);
-            len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
             assert(len0 > 0);
         }
 
-        int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+        int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
         if (len1 < 0) {
             ctx->buf1.resize(len1);
-            len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
             assert(len1 > 0);
         }
 
@@ -2248,7 +2248,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
 
     for (size_t i = 0; i < size_org; ++i) {
-        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+        const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
 
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
@@ -2269,7 +2269,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     // if no non-EOG tokens are left -> reduce cur_p to single EOT token
     if (n_non_eog == 0) {
         cur_p->size = 1;
-        cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
+        cur_p->data[0].id = ctx->vocab->token_eot();
         cur_p->data[0].logit = 1.0f;
 
         return;
@@ -2291,7 +2291,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
 
     for (size_t i = 0; i < size_org; ++i) {
-        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+        const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
 
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
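The infill sampler only needs the end-of-generation membership test, now spelled ctx->vocab->is_eog(id). A stand-alone sketch of the probability split it performs, with the vocabulary lookup replaced by a hypothetical std::set of EOG ids:

```cpp
#include <cstdio>
#include <set>
#include <vector>

struct cand_sketch { int id; float p; };

int main() {
    // hypothetical EOG set and candidate list; the real check is vocab->is_eog(id)
    std::set<int> eog_ids = {2, 32007};
    std::vector<cand_sketch> cur = {{15, 0.40f}, {2, 0.35f}, {99, 0.20f}, {32007, 0.05f}};

    float p_eog_sum = 0.0f;
    float p_txt_sum = 0.0f;
    for (const auto & c : cur) {
        (eog_ids.count(c.id) ? p_eog_sum : p_txt_sum) += c.p;
    }
    std::printf("p_eog_sum = %.2f, p_txt_sum = %.2f\n", p_eog_sum, p_txt_sum);
}
```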
src/llama-vocab.cpp: 1655 lines changed. File diff suppressed because it is too large.
@@ -4,74 +4,161 @@
 
 #include <string>
 #include <vector>
-#include <unordered_map>
-#include <map>
-#include <set>
+#include <memory>
 
-static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
-    switch (type) {
-        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        case LLAMA_VOCAB_TYPE_UGM: return "UGM";
-        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
-        default: return "unknown";
-    }
-}
-
-struct llm_tokenizer;
+struct LLM_KV;
+struct llama_model_loader;
 
 struct llama_vocab {
-    using id = llama_token;
-    using token = std::string;
-    using tattr = llama_token_attr;
-
     struct token_data {
-        token text;
+        std::string text;
         float score;
-        tattr attr;
+        llama_token_attr attr;
     };
 
-    uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
+    llama_vocab();
+    ~llama_vocab();
+
+    void load(llama_model_loader & ml, const LLM_KV & kv);
+
+    enum llama_vocab_type get_type() const;
+    enum llama_vocab_pre_type get_pre_type() const;
+
+    // TODO: how to deduplicate with llama_hparams.n_vocab ?
+    uint32_t n_vocab() const;
+
+    std::string type_name() const;
+
+    bool is_normal (llama_token id) const;
+    bool is_unknown (llama_token id) const;
+    bool is_control (llama_token id) const;
+    bool is_byte (llama_token id) const;
+    bool is_user_defined(llama_token id) const;
+    bool is_unused (llama_token id) const;
+    bool is_eog (llama_token id) const;
+
+    uint8_t token_to_byte(llama_token id) const;
+    llama_token byte_to_token(uint8_t ch) const;
+
+    llama_token text_to_token(const std::string & text) const;
+
+    const token_data & get_token_data(llama_token id) const;
+
+    const char * token_get_text (llama_token id) const;
+    float token_get_score(llama_token id) const;
+    llama_token_attr token_get_attr (llama_token id) const;
+
+    llama_token token_bos() const;
+    llama_token token_eos() const;
+    llama_token token_eot() const;
+    llama_token token_eom() const;
+    llama_token token_unk() const;
+    llama_token token_cls() const;
+    llama_token token_sep() const;
+    llama_token token_nl () const;
+    llama_token token_pad() const;
+
+    llama_token token_prefix() const;
+    llama_token token_middle() const;
+    llama_token token_suffix() const;
+
+    llama_token token_fim_pre() const;
+    llama_token token_fim_suf() const;
+    llama_token token_fim_mid() const;
+    llama_token token_fim_pad() const;
+    llama_token token_fim_rep() const;
+    llama_token token_fim_sep() const;
+
+    bool add_space_prefix () const;
+    bool add_bos_token () const;
+    bool add_eos_token () const;
+    bool ignore_merges () const;
+    bool clean_spaces () const;
+    bool remove_extra_whitespaces () const;
+    bool escape_whitespaces () const;
+    bool treat_whitespace_as_suffix() const;
+
+    int max_token_text_len() const;
+
+    void print_info() const;
+
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    std::vector<llama_token> tokenize(
+            std::string raw_text,
+            bool add_special,
+            bool parse_special = false) const;
+
+    int32_t tokenize(
+            const char * text,
+            int32_t text_len,
+            llama_token * tokens,
+            int32_t n_tokens_max,
+            bool add_special,
+            bool parse_special) const;
+
+    // does not write null-terminator to buf
+    int32_t token_to_piece(
+            llama_token token,
+            char * buf,
+            int32_t length,
+            int32_t lstrip,
+            bool special) const;
+
+    // use cached data
+    const std::string & token_to_piece(llama_token token) const;
+
+    // check if token0 is contained as a prefix in token1
+    bool token_is_prefix(
+            llama_token token0,
+            llama_token token1) const;
+
+    int32_t detokenize(
+            const llama_token * tokens,
+            int32_t n_tokens,
+            char * text,
+            int32_t text_len_max,
+            bool remove_special,
+            bool unparse_special) const;
+
+    std::string detokenize(
+            const std::vector<llama_token> & tokens,
+            bool special) const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+
+    std::string token_to_piece_for_cache(
+            llama_token token,
+            bool special) const;
 
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
-    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     int max_token_len = 0; // used for optimizing longest token search
 
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_data> id_to_token;
-
-    std::vector<id> cache_special_tokens;
-    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
-
     // default LLaMA special tokens
     // TODO: should we set all of these to LLAMA_TOKEN_NULL?
-    id special_bos_id = 1;
-    id special_eos_id = 2;
-    id special_eot_id = LLAMA_TOKEN_NULL;
-    id special_eom_id = LLAMA_TOKEN_NULL;
-    id special_unk_id = 0;
-    id special_sep_id = LLAMA_TOKEN_NULL;
-    id special_pad_id = LLAMA_TOKEN_NULL;
-    id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
-    id special_mask_id = LLAMA_TOKEN_NULL;
+    llama_token special_bos_id = 1;
+    llama_token special_eos_id = 2;
+    llama_token special_eot_id = LLAMA_TOKEN_NULL;
+    llama_token special_eom_id = LLAMA_TOKEN_NULL;
+    llama_token special_unk_id = 0;
+    llama_token special_sep_id = LLAMA_TOKEN_NULL;
+    llama_token special_pad_id = LLAMA_TOKEN_NULL;
+    llama_token special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
+    llama_token special_mask_id = LLAMA_TOKEN_NULL;
 
-    id linefeed_id = 13;
+    llama_token linefeed_id = 13;
 
     // fim tokens
-    id special_fim_pre_id = LLAMA_TOKEN_NULL;
-    id special_fim_suf_id = LLAMA_TOKEN_NULL;
-    id special_fim_mid_id = LLAMA_TOKEN_NULL;
-    id special_fim_pad_id = LLAMA_TOKEN_NULL;
-    id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
-    id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
 
-    // set of all tokens that cause "end of generation"
-    std::set<id> special_eog_ids;
-
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
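Note the two token_to_piece overloads declared above: a C-style variant that writes into a caller buffer and reports the required size, and a cached variant that returns a reference to a precomputed string. A self-contained sketch of that pairing (toy types, not the real implementation):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct vocab_sketch {
    // buffer-based variant: returns the number of bytes written, or a negative
    // value whose magnitude is the required size when the buffer is too small
    int32_t token_to_piece(int token, char * buf, int32_t length) const {
        const std::string & piece = cache.at(token);
        if ((int32_t) piece.size() > length) {
            return -(int32_t) piece.size();
        }
        std::copy(piece.begin(), piece.end(), buf);
        return (int32_t) piece.size();
    }

    // cached variant: hands back a reference to the precomputed piece
    const std::string & token_to_piece(int token) const { return cache.at(token); }

    std::vector<std::string> cache;
};

int main() {
    vocab_sketch v;
    v.cache = {"<s>", "Hello"};
    char buf[16];
    const int32_t n = v.token_to_piece(1, buf, (int32_t) sizeof(buf));
    std::printf("wrote %d bytes, cached piece = \"%s\"\n", n, v.token_to_piece(1).c_str());
}
```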
@@ -82,101 +169,4 @@ struct llama_vocab {
     bool tokenizer_remove_extra_whitespaces = false;
     bool tokenizer_escape_whitespaces = true;
     bool tokenizer_treat_whitespace_as_suffix = false;
-
-    std::vector<char> precompiled_charsmap;
-
-    llm_tokenizer * tokenizer = nullptr;
-
-    llama_vocab() = default;
-    ~llama_vocab();
-
-    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
-
-    void init_tokenizer();
 };
-
-//
-// internal API
-//
-
-// TODO: rename to llama_tokenize_impl
-// TODO: This should probably be in llama.h
-std::vector<llama_vocab::id> llama_tokenize_internal(
-        const llama_vocab & vocab,
-        std::string raw_text,
-        bool add_special,
-        bool parse_special = false);
-
-// TODO: move the API below as member functions of llama_vocab
-llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
-
-const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
-
-float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
-
-llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
-
-bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
-
-bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
-
-llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
-llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
-llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
-llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
-llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
-
-llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
-llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
-llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
-
-llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
-
-bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
-bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
-
-int32_t llama_tokenize_impl(
-        const struct llama_vocab & vocab,
-        const char * text,
-        int32_t text_len,
-        llama_token * tokens,
-        int32_t n_tokens_max,
-        bool add_special,
-        bool parse_special);
-
-// does not write null-terminator to buf
-int32_t llama_token_to_piece_impl(
-        const struct llama_vocab & vocab,
-        llama_token token,
-        char * buf,
-        int32_t length,
-        int32_t lstrip,
-        bool special);
-
-// check if token0 is contained as a prefix in token1
-bool llama_token_is_prefix_impl(
-        const struct llama_vocab & vocab,
-        llama_token token0,
-        llama_token token1);
-
-int32_t llama_detokenize_impl(
-        const struct llama_vocab & vocab,
-        const llama_token * tokens,
-        int32_t n_tokens,
-        char * text,
-        int32_t text_len_max,
-        bool remove_special,
-        bool unparse_special);
-
-std::string llama_detokenize(
-        const struct llama_vocab & vocab,
-        const std::vector<llama_token> & tokens,
-        bool special);
src/llama.cpp: 2738 lines changed. File diff suppressed because it is too large.