Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-27 12:33:06 +01:00)
llama : functions -> methods (#11110)

commit 609ec7e0a0
parent c3f9d25706
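
For orientation: the theme of this commit is moving free functions that took a llama_model / llama_vocab / llama_model_loader argument into member methods on those types (for example, llama_model_select_buft(model, il) becomes model.select_buft(il) in the hunks below). A minimal, self-contained sketch of that refactor pattern follows; the "widget" names are purely illustrative and are not part of llama.cpp.

#include <cstdio>

// Before: a free function receives the object explicitly.
struct widget_v1 { int n_layer; };
static int widget_select(const widget_v1 & w, int il) { return il % w.n_layer; }

// After: the same logic as a const member method; callers drop the explicit object argument.
struct widget_v2 {
    int n_layer;
    int select(int il) const { return il % n_layer; }
};

int main() {
    widget_v1 a{4};
    widget_v2 b{4};
    // both calls compute the same value; only the call syntax changes
    std::printf("%d %d\n", widget_select(a, 3), b.select(3));
    return 0;
}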
@@ -1,5 +1,6 @@
 #include "llama-adapter.h"
 
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -62,7 +63,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
@@ -262,7 +263,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     }
 
     // device buft and device ctx
-    auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+    const auto * model_tensor = model.get_tensor(name.c_str());
     if (!model_tensor) {
         throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
     }
@@ -1,5 +1,7 @@
 #include "llama-context.h"
 
+#include "llama-mmap.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -504,7 +506,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 
     auto * buft = ggml_backend_cpu_buffer_type();
     // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-    auto * output_dev = lctx.model.dev_output.dev;
+    auto * output_dev = lctx.model.dev_output();
     auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
     if (output_dev_host_buft) {
         buft = output_dev_host_buft;
@@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         const llama_token id = cur_p->data[i].id;
-        const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
+        const std::string & piece = grammar.vocab->token_to_piece(id);
 
-        if (llama_token_is_eog_impl(*grammar.vocab, id)) {
+        if (grammar.vocab->is_eog(id)) {
             if (!allow_eog) {
                 cur_p->data[i].logit = -INFINITY;
             }
@@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
-    if (llama_token_is_eog_impl(*grammar.vocab, token)) {
+    if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
@@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
+    const std::string & piece = grammar.vocab->token_to_piece(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);
@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_type_t buft;
         if (offload) {
-            auto * dev = model.dev_layer.at(i).dev;
+            auto * dev = model.dev_layer(i);
             buft = ggml_backend_dev_buffer_type(dev);
         } else {
             buft = ggml_backend_cpu_buffer_type();
@@ -35,7 +35,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }
 
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
@@ -1009,3 +1056,17 @@ bool llama_model_loader::load_all_data(
 
     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
@@ -155,4 +155,8 @@ struct llama_model_loader {
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
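
A hedged sketch of how a caller inside the library might use the two loader methods declared above; the surrounding function is hypothetical and not part of this diff, and it assumes the internal header is available.

#include "llama-model-loader.h"

#include <string>

// hypothetical helper, illustrative only
static void report_file_info(const llama_model_loader & ml) {
    ml.print_info();                           // logs file format, file type and file size
    const std::string ftype = ml.ftype_name(); // human-readable file type string
    (void) ftype;
}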
src/llama-model.cpp: 4095 (file diff suppressed because it is too large)
@@ -4,79 +4,80 @@
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct llama_model_loader;

// available models
// TODO: this enum does not follow the enum naming convention
enum llm_type {
    MODEL_UNKNOWN,
    MODEL_14M,
    MODEL_17M,
    MODEL_22M,
    MODEL_33M,
    MODEL_60M,
    MODEL_70M,
    MODEL_80M,
    MODEL_109M,
    MODEL_137M,
    MODEL_160M,
    MODEL_220M,
    MODEL_250M,
    MODEL_270M,
    MODEL_335M,
    MODEL_410M,
    MODEL_450M,
    MODEL_770M,
    MODEL_780M,
    MODEL_0_5B,
    MODEL_1B,
    MODEL_1_3B,
    MODEL_1_4B,
    MODEL_1_5B,
    MODEL_1_6B,
    MODEL_2B,
    MODEL_2_8B,
    MODEL_3B,
    MODEL_4B,
    MODEL_6B,
    MODEL_6_9B,
    MODEL_7B,
    MODEL_8B,
    MODEL_9B,
    MODEL_11B,
    MODEL_12B,
    MODEL_13B,
    MODEL_14B,
    MODEL_15B,
    MODEL_16B,
    MODEL_20B,
    MODEL_30B,
    MODEL_32B,
    MODEL_34B,
    MODEL_35B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
    MODEL_236B,
    MODEL_314B,
    MODEL_671B,
    MODEL_SMALL,
    MODEL_MEDIUM,
    MODEL_LARGE,
    MODEL_XL,
    MODEL_A1_7B,
    MODEL_A2_7B,
    MODEL_8x7B,
    MODEL_8x22B,
    MODEL_16x12B,
    MODEL_16x3_8B,
    MODEL_10B_128x3_66B,
    MODEL_57B_A14B,
    MODEL_27B,
    LLM_TYPE_UNKNOWN,
    LLM_TYPE_14M,
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_160M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_5B,
    LLM_TYPE_1B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
    LLM_TYPE_6_9B,
    LLM_TYPE_7B,
    LLM_TYPE_8B,
    LLM_TYPE_9B,
    LLM_TYPE_11B,
    LLM_TYPE_12B,
    LLM_TYPE_13B,
    LLM_TYPE_14B,
    LLM_TYPE_15B,
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
    LLM_TYPE_35B,
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_236B,
    LLM_TYPE_314B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
    LLM_TYPE_LARGE,
    LLM_TYPE_XL,
    LLM_TYPE_A1_7B,
    LLM_TYPE_A2_7B,
    LLM_TYPE_8x7B,
    LLM_TYPE_8x22B,
    LLM_TYPE_16x12B,
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_27B,
};

struct llama_layer_posnet {
@@ -286,11 +287,9 @@ struct llama_layer {
};

struct llama_model {
    llm_type type = MODEL_UNKNOWN;
    llm_type type = LLM_TYPE_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

    std::string name = "n/a";

    llama_hparams hparams = {};
@@ -319,78 +318,55 @@ struct llama_model {

    std::vector<llama_layer> layers;

    llama_model_params params;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    llama_split_mode split_mode;
    int main_gpu;
    int n_gpu_layers;

    std::vector<std::string> rpc_servers;

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;


    // lists of buffer types used for each layer
    using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

    // total number of parameters in the model
    uint64_t n_elements = 0;
    explicit llama_model(const struct llama_model_params & params);
    ~llama_model();

    // total size of all the tensors in the model in bytes
    size_t n_bytes = 0;
    void load_stats  (llama_model_loader & ml);
    void load_arch   (llama_model_loader & ml);
    void load_hparams(llama_model_loader & ml);
    void load_vocab  (llama_model_loader & ml);
    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback

    std::string arch_name() const;
    std::string type_name() const;

    std::string desc() const;

    size_t size() const;
    size_t max_nodes() const;
    size_t n_devices() const;

    // total number of parameters in the model
    uint64_t n_elements() const;

    void print_info() const;

    ggml_backend_dev_t dev_layer(int il) const;
    ggml_backend_dev_t dev_output() const;

    ggml_backend_buffer_type_t select_buft(int il) const;

    const struct ggml_tensor * get_tensor(const char * name) const;

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

const char * llm_type_name(llm_type type);

std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);

// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);

// used by llama_adapter_lora
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);

size_t llama_model_max_nodes(const llama_model & model);

struct llama_model_loader;

// TODO: become llama_model methods
void llm_load_stats     (llama_model_loader & ml, llama_model & model);
void llm_load_arch      (llama_model_loader & ml, llama_model & model);
void llm_load_hparams   (llama_model_loader & ml, llama_model & model);
void llm_load_vocab     (llama_model_loader & ml, llama_model & model);
void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
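
The struct above now exposes its behaviour through methods and hides its remaining state behind an opaque impl (the pimpl idiom). A minimal, generic sketch of that idiom follows; the names are illustrative and are not llama.cpp's actual members.

#include <cstdio>
#include <memory>

// header part: callers see only methods and an opaque pointer
class engine {
public:
    engine();
    ~engine(); // defined where `impl` is complete, so unique_ptr can delete it
    int n_devices() const;
private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

// source part: the hidden members can change without touching callers
struct engine::impl {
    int device_count = 1;
};

engine::engine() : pimpl(std::make_unique<impl>()) {}
engine::~engine() = default;

int engine::n_devices() const { return pimpl->device_count; }

int main() {
    engine e;
    std::printf("%d\n", e.n_devices()); // prints 1
    return 0;
}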
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
             use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-    if (qs.model.type == MODEL_70B) {
+    if (qs.model.type == LLM_TYPE_70B) {
         // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
         // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
         // nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -525,18 +525,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
     llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
-    llama_model model;
-    llm_load_arch   (ml, model);
-    llm_load_hparams(ml, model);
-    llm_load_stats  (ml, model);
+    llama_model model(llama_model_default_params());
+
+    model.load_arch   (ml);
+    model.load_hparams(ml);
+    model.load_stats  (ml);
 
     struct quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
-        ftype = model.ftype;
+        ftype = ml.ftype;
     }
     const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
     if (params->imatrix) {
@@ -1663,8 +1663,8 @@ struct llama_sampler_dry {
 
 // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
 static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
-    for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
-        std::string word = llama_detokenize(vocab, {token_id}, true);
+    for (llama_token token_id = 0; token_id < (llama_token) vocab.n_vocab(); token_id++) {
+        std::string word = vocab.detokenize({token_id}, true);
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
@@ -1681,7 +1681,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
             }
         }
         if (match) {
-            std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
+            std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
             if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
                 tokenization.resize(max_tail_len);
             }
@@ -2153,7 +2153,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     float p_eog_sum = 0.0f;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+        if (ctx->vocab->is_eog(cur_p->data[i].id)) {
             p_eog_sum += cur_p->data[i].p;
         } else {
             p_txt_sum += cur_p->data[i].p;
@@ -2175,7 +2175,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     float p_sum = 0.0f;
 
     for (size_t i = 0; i < size_org; ++i) {
-        if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+        if (ctx->vocab->is_eog(cur_p->data[i].id)) {
             p_sum += cur_p->data[i].p;
 
             cur_p->data[cur_p->size++] = cur_p->data[i];
@@ -2203,17 +2203,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
             continue;
         }
 
-        int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+        int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
         if (len0 < 0) {
             ctx->buf0.resize(len0);
-            len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
             assert(len0 > 0);
         }
 
-        int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+        int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
         if (len1 < 0) {
             ctx->buf1.resize(len1);
-            len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
             assert(len1 > 0);
         }
 
@@ -2248,7 +2248,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
 
     for (size_t i = 0; i < size_org; ++i) {
-        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+        const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
 
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
@@ -2269,7 +2269,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     // if no non-EOG tokens are left -> reduce cur_p to single EOT token
     if (n_non_eog == 0) {
         cur_p->size = 1;
-        cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
+        cur_p->data[0].id = ctx->vocab->token_eot();
        cur_p->data[0].logit = 1.0f;
 
         return;
@@ -2291,7 +2291,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
 
     for (size_t i = 0; i < size_org; ++i) {
-        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+        const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
 
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
src/llama-vocab.cpp: 1669 (file diff suppressed because it is too large)
@@ -4,74 +4,161 @@

#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
#include <memory>

static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
    switch (type) {
        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
        default:                    return "unknown";
    }
}

struct llm_tokenizer;
struct LLM_KV;
struct llama_model_loader;

struct llama_vocab {
    using id    = llama_token;
    using token = std::string;
    using tattr = llama_token_attr;

    struct token_data {
        token text;
        float score;
        tattr attr;
        std::string      text;
        float            score;
        llama_token_attr attr;
    };

    uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
    llama_vocab();
    ~llama_vocab();

    void load(llama_model_loader & ml, const LLM_KV & kv);

    enum llama_vocab_type get_type() const;
    enum llama_vocab_pre_type get_pre_type() const;

    // TODO: how to deduplicate with llama_hparams.n_vocab ?
    uint32_t n_vocab() const;

    std::string type_name() const;

    bool is_normal (llama_token id) const;
    bool is_unknown (llama_token id) const;
    bool is_control (llama_token id) const;
    bool is_byte (llama_token id) const;
    bool is_user_defined(llama_token id) const;
    bool is_unused (llama_token id) const;
    bool is_eog (llama_token id) const;

    uint8_t token_to_byte(llama_token id) const;
    llama_token byte_to_token(uint8_t ch) const;

    llama_token text_to_token(const std::string & text) const;

    const token_data & get_token_data(llama_token id) const;

    const char * token_get_text (llama_token id) const;
    float token_get_score(llama_token id) const;
    llama_token_attr token_get_attr (llama_token id) const;

    llama_token token_bos() const;
    llama_token token_eos() const;
    llama_token token_eot() const;
    llama_token token_eom() const;
    llama_token token_unk() const;
    llama_token token_cls() const;
    llama_token token_sep() const;
    llama_token token_nl () const;
    llama_token token_pad() const;

    llama_token token_prefix() const;
    llama_token token_middle() const;
    llama_token token_suffix() const;

    llama_token token_fim_pre() const;
    llama_token token_fim_suf() const;
    llama_token token_fim_mid() const;
    llama_token token_fim_pad() const;
    llama_token token_fim_rep() const;
    llama_token token_fim_sep() const;

    bool add_space_prefix () const;
    bool add_bos_token () const;
    bool add_eos_token () const;
    bool ignore_merges () const;
    bool clean_spaces () const;
    bool remove_extra_whitespaces () const;
    bool escape_whitespaces () const;
    bool treat_whitespace_as_suffix() const;

    int max_token_text_len() const;

    void print_info() const;

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;

    std::vector<llama_token> tokenize(
            std::string raw_text,
            bool add_special,
            bool parse_special = false) const;

    int32_t tokenize(
            const char * text,
            int32_t text_len,
            llama_token * tokens,
            int32_t n_tokens_max,
            bool add_special,
            bool parse_special) const;

    // does not write null-terminator to buf
    int32_t token_to_piece(
            llama_token token,
            char * buf,
            int32_t length,
            int32_t lstrip,
            bool special) const;

    // use cached data
    const std::string & token_to_piece(llama_token token) const;

    // check if token0 is contained as a prefix in token1
    bool token_is_prefix(
            llama_token token0,
            llama_token token1) const;

    int32_t detokenize(
            const llama_token * tokens,
            int32_t n_tokens,
            char * text,
            int32_t text_len_max,
            bool remove_special,
            bool unparse_special) const;

    std::string detokenize(
            const std::vector<llama_token> & tokens,
            bool special) const;

private:
    struct impl;
    std::unique_ptr<impl> pimpl;

    std::string token_to_piece_for_cache(
            llama_token token,
            bool special) const;

    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

    int max_token_len = 0; // used for optimizing longest token search

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data> id_to_token;

    std::vector<id> cache_special_tokens;
    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);

    std::map<std::pair<std::string, std::string>, int> bpe_ranks;

    // default LLaMA special tokens
    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
    id special_bos_id = 1;
    id special_eos_id = 2;
    id special_eot_id = LLAMA_TOKEN_NULL;
    id special_eom_id = LLAMA_TOKEN_NULL;
    id special_unk_id = 0;
    id special_sep_id = LLAMA_TOKEN_NULL;
    id special_pad_id = LLAMA_TOKEN_NULL;
    id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
    id special_mask_id = LLAMA_TOKEN_NULL;
    llama_token special_bos_id = 1;
    llama_token special_eos_id = 2;
    llama_token special_eot_id = LLAMA_TOKEN_NULL;
    llama_token special_eom_id = LLAMA_TOKEN_NULL;
    llama_token special_unk_id = 0;
    llama_token special_sep_id = LLAMA_TOKEN_NULL;
    llama_token special_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
    llama_token special_mask_id = LLAMA_TOKEN_NULL;

    id linefeed_id = 13;
    llama_token linefeed_id = 13;

    // fim tokens
    id special_fim_pre_id = LLAMA_TOKEN_NULL;
    id special_fim_suf_id = LLAMA_TOKEN_NULL;
    id special_fim_mid_id = LLAMA_TOKEN_NULL;
    id special_fim_pad_id = LLAMA_TOKEN_NULL;
    id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
    id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator

    // set of all tokens that cause "end of generation"
    std::set<id> special_eog_ids;
    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator

    // tokenizer flags
    bool tokenizer_add_space_prefix = false;
@@ -82,101 +169,4 @@ struct llama_vocab {
    bool tokenizer_remove_extra_whitespaces = false;
    bool tokenizer_escape_whitespaces = true;
    bool tokenizer_treat_whitespace_as_suffix = false;

    std::vector<char> precompiled_charsmap;

    llm_tokenizer * tokenizer = nullptr;

    llama_vocab() = default;
    ~llama_vocab();

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;

    void init_tokenizer();
};

//
// internal API
//

// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
        const llama_vocab & vocab,
        std::string raw_text,
        bool add_special,
        bool parse_special = false);

// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);

const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);

float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);

llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);

bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);

bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);

llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);

llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);

llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);

bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);

int32_t llama_tokenize_impl(
        const struct llama_vocab & vocab,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special);

// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
        const struct llama_vocab & vocab,
        llama_token token,
        char * buf,
        int32_t length,
        int32_t lstrip,
        bool special);

// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
        const struct llama_vocab & vocab,
        llama_token token0,
        llama_token token1);

int32_t llama_detokenize_impl(
        const struct llama_vocab & vocab,
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special);

std::string llama_detokenize(
        const struct llama_vocab & vocab,
        const std::vector<llama_token> & tokens,
        bool special);
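
For reference, a hypothetical helper (not part of the diff) showing the method-style vocab API declared in the llama-vocab.h hunk above, which is what the grammar and sampler hunks now call. It assumes the internal header is available.

#include "llama-vocab.h"

#include <string>

// illustrative only: describe a token using the cached piece lookup and the EOG check
static std::string describe_token(const llama_vocab & vocab, llama_token id) {
    std::string out = vocab.token_to_piece(id); // const std::string & overload (cached data)
    if (vocab.is_eog(id)) {
        out += " [eog]";
    }
    return out;
}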
src/llama.cpp: 2738 (file diff suppressed because it is too large)