llama_model_loader: support multiple split/shard GGUFs (#6187)

* split: support in llama_model_loader

* avoid copying the entire vector

Co-authored-by: slaren <slarengh@gmail.com>

* split: move llama_tensor_offset to llama_model_loader

* llama_model_loader: PR feedbacks:
 - use only one gguf_context for metadata only
 - store all ggml_context in a vector as the files and mappings
 - store all weights in a vector along with the source tensor
 - rename ctx_gguf to meta
 - rename ctx_meta to contexts

* avoid copying the entire vector

* Simplify this by making these optional, switch some layer creation tensor optional

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Handle optional tensors

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama_model_loader: fail if backend cannot allocate buffer

* fix mmap buffer management

* llama_model_loader: map file to backend buffer if the allocation succeeds only

* llama_model_loader: only map tensors included in the context

* llama_model_loader: minor, use same variable name for consistency, fix spacing in types cast

* llama_model_loader: fail if any of backend buffer cannot be allocated

* spacing

Co-authored-by: slaren <slarengh@gmail.com>

* fix loop over pointer

Co-authored-by: slaren <slarengh@gmail.com>

* llama_model_loader: if n_tensors declared not equals to loaded tensors in split, throw an exception instead of asserting

* llama_model_loader: ensure mappings vector has the expected size

* llama_model_loader:  use at instead of operator[] if this should never add to the map.

* llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size.

* llama_model_loader: be sure the model mappings has enough capacity before allocating backend buffer

* llama_model_loader: fix map -> unordered map

* llama_split_prefix: use a clearer version, not pass split path len but dest max len.

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* llama : minor

ggml-ci

* llama : introduce some typedef helpers

* docs: add model shard in hot topic

* llama_model_loader: put mapping in a unique_ptr from the moment it is allocated

Co-authored-by: slaren <slarengh@gmail.com>

* fix llama_split_prefix

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
This commit is contained in:
Pierrick Hymbert 2024-03-22 19:00:01 +01:00 committed by GitHub
parent ee804f6223
commit dba1af6129
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 412 additions and 224 deletions

View File

@ -22,6 +22,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----

View File

@ -1,31 +1,34 @@
#include "llama.h"
#include "ggml.h"
#include "common.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <string>
#include <vector>
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <climits>
#include <stdexcept>
#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
static const int SPLIT_FILENAME_MAX = 256;
static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
try {
if (!split_params_parse_ex(argc, argv, params)) {
split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
return result;
}
@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) {
}
}
static std::string split_file_name(const std::string & path, int i_split, int n_split) {
char f_split[SPLIT_FILENAME_MAX] = {0};
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
return std::string(f_split);
}
struct split_strategy {
const split_params params;
std::ifstream & f_input;
@ -180,8 +177,9 @@ struct split_strategy {
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@ -189,10 +187,11 @@ struct split_strategy {
gguf_add_tensor(ctx_out, meta);
}
auto split_name = split_file_name(params.output, i_split, n_split);
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
fout = std::ofstream(split_name, std::ios::binary);
fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto meta_size = gguf_get_meta_size(ctx_out);
@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) {
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
char first_split_path[PATH_MAX] = {0};
llama_split_path(first_split_path, sizeof(first_split_path),
split_params.output.c_str(), strategy.i_split, strategy.n_split);
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
first_split_path,
split_params.n_split_tensors);
strategy.split_start();
@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) {
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;
std::string split_prefix;
char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};
// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
@ -309,89 +314,66 @@ static void gguf_merge(const split_params & split_params) {
/*.ctx = */ &ctx_meta,
};
auto split_name = split_params.input;
if (i_split > 0) {
split_name = split_file_name(split_prefix, i_split, n_split);
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
auto * ctx_gguf = gguf_init_from_file(split_path, params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);
if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_GENERAL_SPLIT_N_SPLIT);
LLM_KV_SPLIT_COUNT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
// Do not trigger merge if we try to merge again the output
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}
// Verify the file naming
{
int i_split_file = 0;
int n_split_file = 0;
const char * i_split_format = "-00000-of-00000.gguf";
if (split_name.size() < strlen(i_split_format)) {
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
const char * split_name_c_str = split_name.c_str();
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d i_split_file=%d"
" n_split=%d n_split_file=%d\n", __func__,
split_params.input.c_str(),
i_split, i_split_file,
n_split, n_split_file);
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
}
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) {
// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
auto split_name = split_file_name(split_prefix, i_split, n_split);
std::ifstream f_input(split_name.c_str(), std::ios::binary);
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
@ -481,8 +464,8 @@ int main(int argc, const char ** argv) {
break;
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default:split_print_usage(argv[0]);
exit(1);
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return 0;

456
llama.cpp
View File

@ -52,6 +52,9 @@
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
@ -290,6 +293,10 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
LLM_KV_SPLIT_TENSORS_COUNT,
LLM_KV_SSM_INNER_SIZE,
LLM_KV_SSM_CONV_KERNEL,
LLM_KV_SSM_STATE_SIZE,
@ -355,6 +362,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
{ LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
{ LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@ -1099,6 +1110,7 @@ struct llama_file {
}
}
};
using llama_files = std::vector<std::unique_ptr<llama_file>>;
struct llama_mmap {
void * addr;
@ -1299,6 +1311,7 @@ struct llama_mmap {
}
#endif
};
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
@ -1448,6 +1461,7 @@ struct llama_mlock {
static void raw_unlock(const void * addr, size_t len) {}
#endif
};
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
@ -2023,12 +2037,12 @@ struct llama_model {
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_t> bufs;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
// model memory mapped files
llama_mmaps mappings;
// objects representing data potentially being locked in memory
std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
llama_mlock mlock_mmap;
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@ -2792,6 +2806,8 @@ namespace GGUFMeta {
};
}
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
@ -2802,54 +2818,133 @@ struct llama_model_loader {
bool use_mmap = false;
llama_file file;
llama_files files;
llama_ftype ftype;
llama_fver fver;
std::unique_ptr<llama_mmap> mapping;
llama_mmaps mappings;
// Holds information on a model weights
struct llama_tensor_weights {
uint16_t idx; // source file index
size_t offs; // tensor data offset in the original file
ggml_tensor * tensor;
llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
}
};
std::vector<llama_tensor_weights> weights;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;
struct gguf_context * meta = NULL;
std::vector<ggml_context *> contexts;
std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
}
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
if (param_overrides_p != nullptr) {
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
meta = gguf_init_from_file(fname.c_str(), params);
if (!meta) {
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
}
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
n_kv = gguf_get_n_kv(ctx_gguf);
n_tensors = gguf_get_n_tensors(ctx_gguf);
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
}
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
for (int i = 0; i < n_tensors; i++) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
n_elements += ggml_nelements(t);
n_bytes += ggml_nbytes(t);
// Load additional GGML contexts
if (n_split > 1) {
uint16_t idx = 0;
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
}
char split_prefix[PATH_MAX] = {0};
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
}
if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}
char split_path[PATH_MAX] = {0};
for (idx = 1; idx < n_split; idx++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
}
// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
}
files.emplace_back(new llama_file(split_path, "rb"));
contexts.emplace_back(ctx);
gguf_free(ctx_gguf);
}
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
// sanity check
{
const int n_tensors_loaded = (int) weights.size();
if (n_tensors != n_tensors_loaded) {
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
}
}
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
}
n_kv = gguf_get_n_kv(meta);
n_tensors = weights.size();
fver = (enum llama_fver) gguf_get_version(meta);
for (auto & w : weights) {
n_elements += ggml_nelements(w.tensor);
n_bytes += ggml_nbytes(w.tensor);
}
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@ -2864,7 +2959,8 @@ struct llama_model_loader {
enum ggml_type type_max = GGML_TYPE_F32;
for (int i = 0; i < n_tensors; i++) {
enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
const ggml_tensor * tensor = weights.at(i).tensor;
enum ggml_type type = tensor->type;
n_type[type]++;
@ -2874,8 +2970,8 @@ struct llama_model_loader {
}
if (trace > 0) {
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
const uint16_t sid = weights.at(i).idx;
LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
}
}
@ -2911,22 +3007,23 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
{
const int kid = gguf_find_key(ctx_gguf, "general.file_type");
const int kid = gguf_find_key(meta, "general.file_type");
if (kid >= 0) {
ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
}
}
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
const char * name = gguf_get_key(meta, i);
const enum gguf_type type = gguf_get_kv_type(meta, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx_gguf, i);
std::string value = gguf_kv_to_str(meta, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@ -2955,18 +3052,18 @@ struct llama_model_loader {
}
~llama_model_loader() {
if (ctx_gguf) {
gguf_free(ctx_gguf);
if (meta) {
gguf_free(meta);
}
if (ctx_meta) {
ggml_free(ctx_meta);
for (auto * ctx : contexts) {
ggml_free(ctx);
}
}
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const std::string & key, T & result, const bool required = true) {
const int kid = gguf_find_key(ctx_gguf, key.c_str());
const int kid = gguf_find_key(meta, key.c_str());
if (kid < 0) {
if (required) {
@ -2976,7 +3073,7 @@ struct llama_model_loader {
}
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
result = arr_info.length;
@ -2996,7 +3093,7 @@ struct llama_model_loader {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@ -3019,20 +3116,33 @@ struct llama_model_loader {
}
const char * get_tensor_name(int i) const {
return gguf_get_tensor_name(ctx_gguf, i);
return weights.at(i).tensor->name;
}
const llama_tensor_weights & get_weights(const char * name) const {
for (const auto & weight : weights) {
if (strcmp(name, weight.tensor->name) == 0) {
return weight;
}
}
throw std::runtime_error(format("tensor %s not found", name));
}
struct ggml_tensor * get_tensor_meta(const char * name) const {
return ggml_get_tensor(ctx_meta, name);
try {
return get_weights(name).tensor;
} catch (const std::runtime_error & e) {
return NULL;
}
}
struct ggml_tensor * get_tensor_meta(int i) const {
return get_tensor_meta(get_tensor_name(i));
}
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
ggml_set_name(tensor, ggml_get_name(meta));
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
n_created++;
@ -3040,7 +3150,7 @@ struct llama_model_loader {
}
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
if (cur == NULL) {
if (!required) {
@ -3075,76 +3185,79 @@ struct llama_model_loader {
}
}
size_t file_offset(const char * name) const {
const int idx = gguf_find_tensor(ctx_gguf, name);
if (idx < 0) {
throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
}
return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
}
void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
// prefetch the whole file - all the data is needed anyway
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
if (use_mmap) {
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
mappings.reserve(files.size());
mmaps_used.reserve(files.size());
for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
mmaps_used.emplace_back(std::make_pair(mapping->size, 0));
if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
mlock_mmap->init(mapping->addr);
mlock_mmaps->emplace_back(std::move(mlock_mmap));
}
mappings.emplace_back(std::move(mapping));
}
}
// compute the total size of all tensors for progress reporting
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
size_data += ggml_nbytes(cur);
}
if (use_mmap && mapping) {
if (lmlock) {
lmlock->init(mapping->addr);
}
mmap_used_first = mapping->size;
for (auto & w : weights) {
size_data += ggml_nbytes(w.tensor);
}
}
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
GGML_ASSERT(mapping);
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
GGML_ASSERT(!mappings.empty());
const auto & mapping = mappings.at(idx);
*first = mapping->size;
*last = 0;
*addr = mapping->addr;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
const size_t offs = file_offset(ggml_get_name(tensor));
*first = std::min(*first, offs);
*last = std::max(*last, offs + ggml_nbytes(tensor));
const auto & w = get_weights(ggml_get_name(tensor));
if (w.idx != idx) {
continue;
}
*first = std::min(*first, w.offs);
*last = std::max(*last, w.offs + ggml_nbytes(tensor));
}
}
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const {
const size_t offs = file_offset(ggml_get_name(cur));
const auto & w = get_weights(ggml_get_name(cur));
if (use_mmap && mapping) {
if (use_mmap) {
const auto & mapping = mappings.at(w.idx);
if (cur->data == nullptr) {
cur->data = (uint8_t *)mapping->addr + offs;
cur->data = (uint8_t *)mapping->addr + w.offs;
} else {
memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
}
} else {
GGML_ASSERT(cur->data != nullptr);
file.seek(offs, SEEK_SET);
file.read_raw(cur->data, ggml_nbytes(cur));
GGML_ASSERT(w.idx < files.size());
const auto & file = files.at(w.idx);
file->seek(w.offs, SEEK_SET);
file->read_raw(cur->data, ggml_nbytes(cur));
}
}
size_t size_done = 0;
size_t size_data = 0;
size_t mmap_used_first = -1;
size_t mmap_used_last = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
// Returns false if cancelled by progress_callback
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
GGML_ASSERT(size_data != 0 && "call init_mapping() first");
bool load_all_data(
struct ggml_context * ctx,
llama_buf_map & bufs_mmap,
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
std::vector<no_init<uint8_t>> read_buf;
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
if (progress_callback) {
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@ -3152,41 +3265,57 @@ struct llama_model_loader {
}
}
const size_t offs = file_offset(ggml_get_name(cur));
const auto & w = get_weights(ggml_get_name(cur));
size_t n_size = ggml_nbytes(cur);
if (use_mmap && mapping) {
if (use_mmap) {
const auto & mapping = mappings.at(w.idx);
ggml_backend_buffer_t buf_mmap = nullptr;
if (bufs_mmap.count(w.idx)) {
buf_mmap = bufs_mmap.at(w.idx);
}
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
if (buf_mmap && cur->data == nullptr) {
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
if (lmlock) {
lmlock->grow_to(offs + ggml_nbytes(cur));
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs);
if (lmlocks) {
const auto & lmlock = lmlocks->at(w.idx);
lmlock->grow_to(w.offs + ggml_nbytes(cur));
}
mmap_used_first = std::min(mmap_used_first, offs);
mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
auto & mmap_used = mmaps_used[w.idx];
mmap_used.first = std::min(mmap_used.first, w.offs);
mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
} else {
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size);
}
} else {
GGML_ASSERT(w.idx < files.size());
const auto & file = files.at(w.idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
file.seek(offs, SEEK_SET);
file.read_raw(cur->data, ggml_nbytes(cur));
file->seek(w.offs, SEEK_SET);
file->read_raw(cur->data, ggml_nbytes(cur));
} else {
read_buf.resize(ggml_nbytes(cur));
file.seek(offs, SEEK_SET);
file.read_raw(read_buf.data(), ggml_nbytes(cur));
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
file->seek(w.offs, SEEK_SET);
file->read_raw(read_buf.data(), ggml_nbytes(cur));
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
}
}
size_done += ggml_nbytes(cur);
size_done += n_size;
}
// check if this is the last call and do final cleanup
if (size_done >= size_data) {
// unmap offloaded tensors and metadata
if (use_mmap && mapping) {
mapping->unmap_fragment(0, mmap_used_first);
if (mmap_used_last != 0) {
mapping->unmap_fragment(mmap_used_last, mapping->size);
if (use_mmap) {
for (uint32_t idx = 0; idx < mappings.size(); idx++) {
const auto & mmap_used = mmaps_used.at(idx);
auto & mapping = mappings.at(idx);
mapping->unmap_fragment(0, mmap_used.first);
if (mmap_used.second != 0) {
mapping->unmap_fragment(mmap_used.second, mapping->size);
}
}
}
if (progress_callback) {
@ -3319,7 +3448,7 @@ static void llm_load_hparams(
llama_model_loader & ml,
llama_model & model) {
auto & hparams = model.hparams;
const gguf_context * ctx = ml.ctx_gguf;
const gguf_context * ctx = ml.meta;
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@ -3709,7 +3838,7 @@ static void llm_load_vocab(
llama_model & model) {
auto & vocab = model.vocab;
struct gguf_context * ctx = ml.ctx_gguf;
struct gguf_context * ctx = ml.meta;
const auto kv = LLM_KV(model.arch);
@ -4319,10 +4448,8 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
}
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@ -5024,23 +5151,41 @@ static bool llm_load_tensors(
ml.done_getting_tensors();
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
ml.init_mappings(true, &model.mlock_mmaps);
model.mappings.reserve(ml.mappings.size());
// create the backend buffers
std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
ctx_bufs.reserve(ctx_map.size());
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
model.bufs.reserve(n_max_backend_buffer);
for (auto & it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = nullptr;
llama_buf_map bufs;
bufs.reserve(n_max_backend_buffer);
// only the mmap region containing the tensors in the model is mapped to the backend buffer
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
void * addr = nullptr;
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
if (first >= last) {
continue;
}
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
if (buf == nullptr) {
throw std::runtime_error("unable to allocate backend CPU buffer");
}
model.bufs.push_back(buf);
bufs.emplace(idx, buf);
#ifdef GGML_USE_CUBLAS
if (n_layer >= n_gpu_layers) {
ggml_backend_cuda_register_host_buffer(
@ -5049,31 +5194,54 @@ static bool llm_load_tensors(
}
#endif
}
}
#ifdef GGML_USE_METAL
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
const size_t max_size = ggml_get_max_tensor_size(ctx);
void * addr = nullptr;
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
if (first >= last) {
continue;
}
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
if (buf == nullptr) {
throw std::runtime_error("unable to allocate backend metal buffer");
}
model.bufs.push_back(buf);
bufs.emplace(idx, buf);
}
}
#endif
else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (buf == nullptr) {
throw std::runtime_error("unable to allocate backend buffer");
}
model.bufs.push_back(buf);
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
model.mlock_bufs.emplace_back(new llama_mlock);
auto & mlock_buf = model.mlock_bufs.back();
mlock_buf->init (ggml_backend_buffer_get_base(buf));
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
}
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
bufs.emplace(idx, buf);
}
if (buf == nullptr) {
}
if (bufs.empty()) {
throw std::runtime_error("failed to allocate buffer");
}
for (auto & buf : bufs) {
// indicate that this buffer contains weights
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
model.bufs.push_back(buf);
ctx_bufs.emplace_back(ctx, buf);
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
}
ctx_bufs.emplace_back(ctx, bufs);
}
if (llama_supports_gpu_offload()) {
@ -5105,13 +5273,15 @@ static bool llm_load_tensors(
// load tensor data
for (auto & it : ctx_bufs) {
ggml_context * ctx = it.first;
ggml_backend_buffer_t buf = it.second;
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
auto & bufs = it.second;
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
return false;
}
}
model.mapping = std::move(ml.mapping);
for (auto & mapping : ml.mappings) {
model.mappings.emplace_back(std::move(mapping));
}
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
@ -12302,7 +12472,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
#endif
llama_model_loader ml(fname_inp, use_mmap, NULL);
ml.init_mapping(false); // no prefetching?
ml.init_mappings(false); // no prefetching?
llama_model model;
llm_load_arch(ml, model);
@ -12326,12 +12496,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
struct gguf_context * ctx_out = gguf_init_empty();
// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.ctx_gguf);
gguf_set_kv (ctx_out, ml.meta);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * meta = ml.get_tensor_meta(i);
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
const std::string name = ggml_get_name(meta);
@ -12371,7 +12541,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// populate the original tensors so we get an initial meta data
for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * meta = ml.get_tensor_meta(i);
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
gguf_add_tensor(ctx_out, meta);
}
@ -12576,7 +12746,7 @@ static int llama_apply_lora_from_file_internal(
if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
ml->init_mapping(/*prefetch*/ false); // no prefetching
ml->init_mappings(/*prefetch*/ false); // no prefetching
}
struct tensor_meta {
@ -12697,7 +12867,7 @@ static int llama_apply_lora_from_file_internal(
ggml_tensor * base_t;
if (ml) {
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
if (!ml->get_tensor_meta(base_name.c_str())) {
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}
@ -14645,6 +14815,30 @@ LLAMA_API int32_t llama_chat_apply_template(
return res;
}
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
return strlen(split_path);
}
return 0;
}
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
std::string str_split_path(split_path);
char postfix[32];
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);
// check if dest ends with postfix
int size_prefix = str_split_path.size() - str_postfix.size();
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path);
return size_prefix;
}
return 0;
}
struct llama_timings llama_get_timings(struct llama_context * ctx) {
struct llama_timings result = {
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,

10
llama.h
View File

@ -960,6 +960,16 @@ extern "C" {
int32_t n_past,
int32_t n_predict);
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
// Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);