Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-26 14:20:31 +01:00)
llama : arch (cont)
ggml-ci
This commit is contained in:
parent
52063f737d
commit
c8669a0e55
@@ -68,12 +68,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                return ggml_add(ctx, cur, layer_dir);
-            });
+        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
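The lambda removed above is not lost: it moves behind llama_model_select_buft, which this commit adds further down, so every caller needing a per-layer buffer type now goes through one helper. For orientation, here is a minimal, hypothetical sketch of what the control-vector path ultimately does with that buffer type: one F32 tensor of size n_embd per layer, allocated from a single buffer type via the public ggml API. The n_embd/n_layer values and the use of the CPU buffer type are placeholders for this sketch, not values from the diff.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <vector>

int main() {
    const int64_t n_embd  = 4096;   // placeholder sizes, not taken from the diff
    const int     n_layer = 32;

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*(size_t) n_layer,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,     // tensor data is allocated from the backend buffer below
    };
    ggml_context * ctx = ggml_init(params);

    std::vector<ggml_tensor *> tensors;
    tensors.push_back(nullptr);     // there's never a tensor for layer 0
    for (int il = 1; il < n_layer; il++) {
        tensors.push_back(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd));
    }

    // allocate every tensor created in ctx from one buffer type (CPU here for simplicity;
    // llama_model_select_buft would instead pick a device-appropriate one per layer)
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    const bool ok = buf != NULL;

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return ok ? 0 : 1;
}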
1392 src/llama-arch.cpp (file diff suppressed because it is too large)
1323 src/llama-arch.h (file diff suppressed because it is too large)
@@ -40,7 +40,7 @@ struct llama_data_write {
     }
 
     void write_model_info(const struct llama_context * ctx) {
-        std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string arch_str = llm_arch_name(ctx->model.arch);
         write_string(arch_str);
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
@@ -263,7 +263,8 @@ struct llama_data_read {
 
     // validate model information
    void read_model_info(const struct llama_context * ctx) {
-        std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);
+
         std::string arch_str;
         read_string(arch_str);
         if (cur_arch_str != arch_str) {
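Both the write and read paths above now go through llm_arch_name() instead of indexing LLM_ARCH_NAMES directly, but the session-file logic itself is unchanged: the writer stores the architecture name as a string and the reader compares it against the current model before restoring state. A small self-contained sketch of that round-trip, with hypothetical write_string/read_string stand-ins for the llama_data_write/llama_data_read helpers (length-prefixed strings over a byte stream; the real serialization format may differ):

#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>

// hypothetical stand-ins for llama_data_write::write_string / llama_data_read::read_string
static void write_string(std::ostream & out, const std::string & s) {
    const uint32_t n = (uint32_t) s.size();
    out.write((const char *) &n, sizeof(n));
    out.write(s.data(), n);
}

static void read_string(std::istream & in, std::string & s) {
    uint32_t n = 0;
    in.read((char *) &n, sizeof(n));
    s.resize(n);
    in.read(&s[0], n);
}

int main() {
    std::stringstream session;

    const std::string cur_arch_str = "llama";   // what llm_arch_name(model.arch) would return
    write_string(session, cur_arch_str);        // write_model_info

    std::string arch_str;
    read_string(session, arch_str);             // read_model_info
    if (cur_arch_str != arch_str) {
        throw std::runtime_error("session file was saved with a different model architecture");
    }
    return 0;
}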
@@ -1,5 +1,7 @@
 #include "llama-model.h"
 
+#include "llama-impl.h"
+
 std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
@@ -42,3 +44,49 @@ std::string llama_model_ftype_name(llama_ftype ftype) {
         default: return "unknown, may not work";
     }
 }
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
+    }
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+    return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
+    }
+    throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
+    return select_buft(*model.dev_layer.at(il).buft_list,
+        [&](ggml_context * ctx) {
+            ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            return ggml_add(ctx, cur, layer_dir);
+        });
+}
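Two details of the helpers added above are worth noting: the probe tensor's sources are tagged with a zero-size buffer of the candidate type (enough for ggml_backend_dev_supports_op to know where the data would live, without allocating anything), and the candidates are tried in the priority order of the model's buffer-type list, with an exception only when nothing matches. A stripped-down, hypothetical version of that selection loop with generic types (not the llama_model::buft_list_t from the diff) could look like this:

#include <functional>
#include <stdexcept>
#include <utility>
#include <vector>

// generic shape of select_buft(): walk a priority-ordered list of (device, buffer type)
// candidates and return the first buffer type whose device passes the probe.
template <typename Dev, typename Buft>
static Buft select_first_supported(const std::vector<std::pair<Dev, Buft>> & candidates,
                                   const std::function<bool(Dev, Buft)> & supported) {
    for (const auto & cand : candidates) {
        if (supported(cand.first, cand.second)) {
            return cand.second;
        }
    }
    throw std::runtime_error("no suitable buffer type found");
}

int main() {
    // toy stand-ins: "devices" 0 and 1, "buffer types" 10 and 11; only device 1 supports the op
    std::vector<std::pair<int, int>> candidates = {{0, 10}, {1, 11}};
    const int buft = select_first_supported<int, int>(candidates,
        [](int dev, int /*buft*/) { return dev == 1; });
    return buft == 11 ? 0 : 1;
}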
@@ -5,8 +5,6 @@
 #include "llama-vocab.h"
 #include "llama-mmap.h"
 
-#include "llama-impl.h"
-
 #include "ggml-cpp.h"
 
 #include <array>
@@ -613,42 +611,6 @@ struct llama_model {
     }
 };
 
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-    ggml_init_params params = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-    ggml_context_ptr ctx { ggml_init(params) };
-    if (!ctx) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-
-    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-    ggml_tensor * op_tensor = fn(ctx.get());
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op_tensor->src[i] != nullptr) {
-            assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf.get();
-        }
-    }
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
-    return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (buft_supported(cur_buft, cur_dev, fn)) {
-            return cur_buft;
-        }
-    }
-    throw std::runtime_error(format("no suitable buffer type found"));
-}
-
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
 
 std::string llama_model_ftype_name(llama_ftype ftype);
177 src/llama.cpp
@@ -2975,7 +2975,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llm_arch_name(model.arch));
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
@@ -3092,158 +3092,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 }
 
-enum llm_tensor_layer {
-    LLM_TENSOR_LAYER_INPUT,
-    LLM_TENSOR_LAYER_REPEATING,
-    LLM_TENSOR_LAYER_OUTPUT,
-};
-
-struct llm_tensor_info {
-    llm_tensor_layer layer;
-    ggml_op op;
-};
-
-static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
-    {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
-    {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
-    {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
-    {LLM_TENSOR_ROPE_FREQS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
-    {LLM_TENSOR_ROPE_FACTORS_LONG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
-    {LLM_TENSOR_ROPE_FACTORS_SHORT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
-    {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_CROSS_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_CROSS_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_CROSS_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_CROSS_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ENC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_INP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_INP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_SSM_IN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_VALUE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_RECEPTANCE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_TIME_MIX_OUTPUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CHANNEL_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CHANNEL_MIX_VALUE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
-    {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
-    {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
-    {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
-    {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_FFN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_FFN_NORM_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_LAYER_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_Q_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_FFN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_DEC_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_DEC_CROSS_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_DEC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ENC_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_ENC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_DEC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_ENC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
-    {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
-    {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
-    // this tensor is loaded for T5, but never used
-    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
-    {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
-    {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
-    {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
-    {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
-    {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-};
-
 // checks if the weight tensor can be used with the specified buffer type and device
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
@@ -3618,11 +3466,12 @@ static bool llm_load_tensors(
             tn_tensor = LLM_TENSOR_OUTPUT;
         }
 
-        auto it = llm_tensor_info_mapping.find(tn_tensor);
-        if (it == llm_tensor_info_mapping.end()) {
+        llm_tensor_info info;
+        try {
+            info = llm_tensor_info_for(tn_tensor);
+        } catch (const std::out_of_range & e) {
             throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
         }
-        const auto & info = it->second;
 
         // tensors with "bias" suffix are always used with GGML_OP_ADD
         ggml_op op;
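The open-coded map lookup above is replaced by llm_tensor_info_for(), whose definition lands in the suppressed llama-arch diff; the caller only relies on it throwing std::out_of_range for tensors missing from the table, which is the behaviour of std::map::at. A hypothetical sketch of that shape (names and values here are illustrative, not the real llama-arch table):

#include <map>
#include <stdexcept>

enum tensor_id { TENSOR_TOKEN_EMBD, TENSOR_OUTPUT, TENSOR_UNKNOWN };
struct tensor_info { int layer; int op; };

static const std::map<tensor_id, tensor_info> tensor_infos = {
    {TENSOR_TOKEN_EMBD, {/*layer=*/0, /*op=*/1}},
    {TENSOR_OUTPUT,     {/*layer=*/2, /*op=*/3}},
};

// stand-in for llm_tensor_info_for(): std::map::at throws std::out_of_range for unknown keys
static const tensor_info & tensor_info_for(tensor_id t) {
    return tensor_infos.at(t);
}

int main() {
    tensor_info info;
    try {
        info = tensor_info_for(TENSOR_UNKNOWN);
    } catch (const std::out_of_range &) {
        // the caller in llm_load_tensors turns this into a descriptive runtime_error
        return 0;
    }
    (void) info;
    return 1;
}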
@@ -17283,9 +17132,12 @@ int32_t llama_detokenize(
 //
 
 static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
-    if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
-        return LLM_CHAT_TEMPLATES.at(tmpl);
+    try {
+        return llm_chat_template_from_str(tmpl);
+    } catch (const std::out_of_range &) {
+        // ignore
     }
 
     auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
         return tmpl.find(haystack) != std::string::npos;
     };
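Template detection keeps its two-stage order: an exact lookup of a registered template name (now via llm_chat_template_from_str, whose std::out_of_range for unknown names is deliberately swallowed), then the existing substring heuristics over the template text. A compact, hypothetical sketch of that control flow, keeping one real heuristic (a template containing <|im_start|> is treated as ChatML) for flavour; the enum and the tmpl_from_name stand-in are made up for this sketch:

#include <stdexcept>
#include <string>

enum chat_tmpl { TMPL_CHATML, TMPL_UNKNOWN };

// stand-in for llm_chat_template_from_str(): throws for unregistered names
static chat_tmpl tmpl_from_name(const std::string & name) {
    if (name == "chatml") return TMPL_CHATML;
    throw std::out_of_range("unknown template name: " + name);
}

static chat_tmpl detect_template(const std::string & tmpl) {
    try {
        return tmpl_from_name(tmpl);                   // exact (registered) name first
    } catch (const std::out_of_range &) {
        // ignore: fall through to content-based heuristics
    }
    auto tmpl_contains = [&tmpl](const char * needle) {
        return tmpl.find(needle) != std::string::npos;
    };
    if (tmpl_contains("<|im_start|>")) return TMPL_CHATML;   // heuristic on the Jinja text
    return TMPL_UNKNOWN;
}

int main() {
    return detect_template("{{ '<|im_start|>user' }}") == TMPL_CHATML ? 0 : 1;
}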
@@ -17799,15 +17651,6 @@ int32_t llama_chat_apply_template(
     return res;
 }
 
-int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
-    auto it = LLM_CHAT_TEMPLATES.begin();
-    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
-        output[i] = it->first.c_str();
-        std::advance(it, 1);
-    }
-    return (int32_t) LLM_CHAT_TEMPLATES.size();
-}
-
 //
 // sampling
 //
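llama_chat_builtin_templates() disappears from llama.cpp here; given the commit's theme it presumably moves next to the chat-template table itself, but its destination sits in one of the suppressed large diffs, so that is an assumption. The removed body does document the calling convention for users of llama.h: with len smaller than the number of templates it fills only len entries but still returns the total, so a first call with a zero-length buffer can size the output array. A short usage sketch on that assumption:

#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    const int32_t n = llama_chat_builtin_templates(nullptr, 0);   // zero-length call just returns the count
    std::vector<const char *> names(n);
    llama_chat_builtin_templates(names.data(), names.size());

    for (const char * name : names) {
        printf("%s\n", name);
    }
    return 0;
}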