gguf : use UNIX line ending

Georgi Gerganov 2023-08-14 13:04:35 +03:00
parent 0c19ae70d5
commit 62490f1380
GPG Key ID: 449E073F9DC10735
5 changed files with 5606 additions and 5606 deletions
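
The change itself is purely a line-ending normalization, which is why every file shows the same number of additions and deletions. As an illustrative sketch (not part of the commit), converting a file from CRLF to LF can be done along these lines; the path argument is only an example:

import sys
from pathlib import Path

# Hypothetical helper: rewrite a file with UNIX (LF) line endings.
# The path comes from the command line and is not taken from the commit.
path = Path(sys.argv[1])
data = path.read_bytes()
path.write_bytes(data.replace(b"\r\n", b"\n"))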


@@ -1,54 +1,54 @@
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.separator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
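
The "{llm}" placeholder in the architecture-specific keys above is substituted with the model architecture name when the key is written (gguf.py below calls str.format on these constants). A minimal sketch of that substitution, assuming the "llama" architecture:

KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"

# Filling in the architecture name gives the final metadata key.
key = KEY_LLM_CONTEXT_LENGTH.format(llm="llama")
print(key)  # -> llama.context_length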

File diff suppressed because it is too large


@@ -1,449 +1,449 @@
#ifndef LLAMA_H
#define LLAMA_H
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#ifdef __GNUC__
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define DEPRECATED(func, hint) func
#endif
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
#ifndef LLAMA_DEFAULT_RMS_EPS
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_model;
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
    llama_token id; // token id
    float logit; // log-odds of the token
    float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
    llama_token_data * data;
    size_t size;
    bool sorted;
} llama_token_data_array;
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
    uint32_t seed; // RNG seed, -1 for random
    int32_t n_ctx; // text context
    int32_t n_batch; // prompt processing batch size
    int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
    float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
    int32_t n_gpu_layers; // number of layers to store in VRAM
    int32_t main_gpu; // the GPU that is used for scratch and small tensors
    const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
    // ref: https://github.com/ggerganov/llama.cpp/pull/2054
    float rope_freq_base; // RoPE base frequency
    float rope_freq_scale; // RoPE frequency scaling factor
    // called with a progress value between 0 and 1, pass NULL to disable
    llama_progress_callback progress_callback;
    // context pointer passed to the progress callback
    void * progress_callback_user_data;
    // Keep the booleans together to avoid misalignment during copy-by-value.
    bool low_vram; // if true, reduce VRAM usage at the cost of performance
    bool f16_kv; // use fp16 for KV cache
    bool logits_all; // the llama_eval() call computes all logits, not just the last one
    bool vocab_only; // only load the vocabulary, no weights
    bool use_mmap; // use mmap if possible
    bool use_mlock; // force system to keep model in RAM
    bool embedding; // embedding mode only
};
// model file types
enum llama_ftype {
    LLAMA_FTYPE_ALL_F32 = 0,
    LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
    // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
    // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
    LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
// model quantization parameters
typedef struct llama_model_quantize_params {
    int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
    enum llama_ftype ftype; // quantize to this llama_ftype
    bool allow_requantize; // allow quantizing non-f32/f16 tensors
    bool quantize_output_tensor; // quantize output.weight
} llama_model_quantize_params;
// grammar types
struct llama_grammar;
// grammar element type
enum llama_gretype {
    // end of rule definition
    LLAMA_GRETYPE_END = 0,
    // start of alternate definition for rule
    LLAMA_GRETYPE_ALT = 1,
    // non-terminal element: reference to rule
    LLAMA_GRETYPE_RULE_REF = 2,
    // terminal element: character (code point)
    LLAMA_GRETYPE_CHAR = 3,
    // inverse char(s) ([^a], [^a-b] [^abc])
    LLAMA_GRETYPE_CHAR_NOT = 4,
    // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
    // be an inclusive range ([a-z])
    LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
    // modifies a preceding LLAMA_GRETYPE_CHAR or
    // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
    LLAMA_GRETYPE_CHAR_ALT = 6,
};
typedef struct llama_grammar_element {
    enum llama_gretype type;
    uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information
struct llama_timings {
    double t_start_ms;
    double t_end_ms;
    double t_load_ms;
    double t_sample_ms;
    double t_p_eval_ms;
    double t_eval_ms;
    int32_t n_sample;
    int32_t n_p_eval;
    int32_t n_eval;
};
LLAMA_API int llama_max_devices();
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
LLAMA_API void llama_backend_init(bool numa);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free();
LLAMA_API int64_t llama_time_us();
LLAMA_API struct llama_model * llama_load_model_from_file(
    const char * path_model,
    struct llama_context_params params);
LLAMA_API void llama_free_model(struct llama_model * model);
LLAMA_API struct llama_context * llama_new_context_with_model(
    struct llama_model * model,
    struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// Returns 0 on success
LLAMA_API int llama_model_quantize(
    const char * fname_inp,
    const char * fname_out,
    const llama_model_quantize_params * params);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
    struct llama_context * ctx,
    const char * path_lora,
    const char * path_base_model,
    int n_threads),
    "please use llama_model_apply_lora_from_file instead");
LLAMA_API int llama_model_apply_lora_from_file(
    const struct llama_model * model,
    const char * path_lora,
    const char * path_base_model,
    int n_threads);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
// Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
    struct llama_context * ctx,
    const llama_token * tokens,
    int n_tokens,
    int n_past,
    int n_threads);
// Same as llama_eval, but use float matrix input directly.
LLAMA_API int llama_eval_embd(
    struct llama_context * ctx,
    const float * embd,
    int n_tokens,
    int n_past,
    int n_threads);
// Export a static computation graph for context of 511 and batch size of 1
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
// parameters here to keep things simple
// IMPORTANT: do not use for anything else other than debugging and testing!
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
    struct llama_context * ctx,
    const char * text,
    llama_token * tokens,
    int n_max_tokens,
    bool add_bos);
LLAMA_API int llama_tokenize_with_model(
    const struct llama_model * model,
    const char * text,
    llama_token * tokens,
    int n_max_tokens,
    bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
    const struct llama_context * ctx,
    const char * * strings,
    float * scores,
    int capacity);
LLAMA_API int llama_get_vocab_from_model(
    const struct llama_model * model,
    const char * * strings,
    float * scores,
    int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(
    const struct llama_context * ctx,
    llama_token token);
LLAMA_API const char * llama_token_to_str_with_model(
    const struct llama_model * model,
    llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
LLAMA_API llama_token llama_token_nl(); // next-line
// Grammar
//
LLAMA_API struct llama_grammar * llama_grammar_init(
    const llama_grammar_element ** rules,
    size_t n_rules,
    size_t start_rule_index);
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
// Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
LLAMA_API void llama_sample_classifier_free_guidance(
    struct llama_context * ctx,
    llama_token_data_array * candidates,
    struct llama_context * guidance_ctx,
    float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
/// @details Selects the token with the highest probability.
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <vector>
#include <string>
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif
#endif // LLAMA_H

File diff suppressed because it is too large

gguf.py

@@ -1,339 +1,339 @@
"""TODOs """TODOs
1. Implement writers for known architectures, LLaMA in particular. 1. Implement writers for known architectures, LLaMA in particular.
2. Add docstrings from the format specs. 2. Add docstrings from the format specs.
3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org. 3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
""" """
import struct import struct
import constants import constants
from enum import IntEnum from enum import IntEnum
from typing import Any, IO, List from typing import Any, IO, List
import numpy as np import numpy as np
import sys import sys
class GGMLQuantizationType(IntEnum): class GGMLQuantizationType(IntEnum):
F32 = 0 F32 = 0
F16 = 1 F16 = 1
class GGUFValueType(IntEnum): class GGUFValueType(IntEnum):
UINT8 = 0 UINT8 = 0
INT8 = 1 INT8 = 1
UINT16 = 2 UINT16 = 2
INT16 = 3 INT16 = 3
UINT32 = 4 UINT32 = 4
INT32 = 5 INT32 = 5
FLOAT32 = 6 FLOAT32 = 6
BOOL = 7 BOOL = 7
STRING = 8 STRING = 8
ARRAY = 9 ARRAY = 9
@staticmethod @staticmethod
def get_type(val): def get_type(val):
if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray): if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
return GGUFValueType.STRING return GGUFValueType.STRING
elif isinstance(val, list): elif isinstance(val, list):
return GGUFValueType.ARRAY return GGUFValueType.ARRAY
elif isinstance(val, float): elif isinstance(val, float):
return GGUFValueType.FLOAT32 return GGUFValueType.FLOAT32
elif isinstance(val, bool): elif isinstance(val, bool):
return GGUFValueType.BOOL return GGUFValueType.BOOL
elif isinstance(val, int): elif isinstance(val, int):
return GGUFValueType.INT32 return GGUFValueType.INT32
else: else:
print("Unknown type: "+str(type(val))) print("Unknown type: "+str(type(val)))
sys.exit() sys.exit()
class GGUFWriter: class GGUFWriter:
def __init__(self, fout: IO): def __init__(self, fout: IO):
self.fout = fout self.fout = fout
self.offset_tensor = 0 self.offset_tensor = 0
self.data_alignment = constants.GGUF_DEFAULT_ALIGNMENT self.data_alignment = constants.GGUF_DEFAULT_ALIGNMENT
self.kv_data = b"" self.kv_data = b""
self.kv_data_count = 0 self.kv_data_count = 0
self.ti_data = b"" self.ti_data = b""
self.ti_data_count = 0 self.ti_data_count = 0
def write_header_to_file(self): def write_header_to_file(self):
self.fout.write(struct.pack("<I", constants.GGUF_MAGIC)) self.fout.write(struct.pack("<I", constants.GGUF_MAGIC))
self.fout.write(struct.pack("<I", constants.GGUF_VERSION)) self.fout.write(struct.pack("<I", constants.GGUF_VERSION))
self.fout.write(struct.pack("<I", self.ti_data_count)) self.fout.write(struct.pack("<I", self.ti_data_count))
self.fout.write(struct.pack("<I", self.kv_data_count)) self.fout.write(struct.pack("<I", self.kv_data_count))
self.flush() self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count)) # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
def write_kv_data_to_file(self): def write_kv_data_to_file(self):
self.fout.write(self.kv_data) self.fout.write(self.kv_data)
self.flush() self.flush()
def write_ti_data_to_file(self): def write_ti_data_to_file(self):
self.fout.write(self.ti_data) self.fout.write(self.ti_data)
self.flush() self.flush()
@classmethod @classmethod
def open(cls, path: str) -> "GGUFWriter": def open(cls, path: str) -> "GGUFWriter":
f = open(path, "wb") f = open(path, "wb")
return cls(f) return cls(f)
def add_key(self, key: str): def add_key(self, key: str):
self.add_val(key, GGUFValueType.STRING, add_vtype=False) self.add_val(key, GGUFValueType.STRING, add_vtype=False)
def add_uint8(self, key: str, val: int): def add_uint8(self, key: str, val: int):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.UINT8) self.add_val(val, GGUFValueType.UINT8)
def add_int8(self, key: str, val: int): def add_int8(self, key: str, val: int):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.INT8) self.add_val(val, GGUFValueType.INT8)
def add_uint16(self, key: str, val: int): def add_uint16(self, key: str, val: int):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.UINT16) self.add_val(val, GGUFValueType.UINT16)
def add_int16(self, key: str, val: int): def add_int16(self, key: str, val: int):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.INT16) self.add_val(val, GGUFValueType.INT16)
def add_uint32(self, key: str, val: int): def add_uint32(self, key: str, val: int):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.UINT32) self.add_val(val, GGUFValueType.UINT32)
def add_int32(self, key: str, val: int): def add_int32(self, key: str, val: int):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.INT32) self.add_val(val, GGUFValueType.INT32)
def add_float32(self, key: str, val: float): def add_float32(self, key: str, val: float):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.FLOAT32) self.add_val(val, GGUFValueType.FLOAT32)
def add_bool(self, key: str, val: bool): def add_bool(self, key: str, val: bool):
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.BOOL) self.add_val(val, GGUFValueType.BOOL)
def add_string(self, key: str, val: str): def add_string(self, key: str, val: str):
if len(val) == 0: return if len(val) == 0: return
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.STRING) self.add_val(val, GGUFValueType.STRING)
def add_array(self, key: str, val: list): def add_array(self, key: str, val: list):
if not isinstance(val, list): if not isinstance(val, list):
raise ValueError("Value must be a list for array type") raise ValueError("Value must be a list for array type")
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY) self.add_val(val, GGUFValueType.ARRAY)
def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True): def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
if vtype is None: if vtype is None:
vtype = GGUFValueType.get_type(val) vtype = GGUFValueType.get_type(val)
if add_vtype: if add_vtype:
self.kv_data += struct.pack("<I", vtype) self.kv_data += struct.pack("<I", vtype)
self.kv_data_count += 1 self.kv_data_count += 1
if vtype == GGUFValueType.UINT8: if vtype == GGUFValueType.UINT8:
self.kv_data += struct.pack("<B", val) self.kv_data += struct.pack("<B", val)
elif vtype == GGUFValueType.INT8: elif vtype == GGUFValueType.INT8:
self.kv_data += struct.pack("<b", val) self.kv_data += struct.pack("<b", val)
elif vtype == GGUFValueType.UINT16: elif vtype == GGUFValueType.UINT16:
self.kv_data += struct.pack("<H", val) self.kv_data += struct.pack("<H", val)
elif vtype == GGUFValueType.INT16: elif vtype == GGUFValueType.INT16:
self.kv_data += struct.pack("<h", val) self.kv_data += struct.pack("<h", val)
elif vtype == GGUFValueType.UINT32: elif vtype == GGUFValueType.UINT32:
self.kv_data += struct.pack("<I", val) self.kv_data += struct.pack("<I", val)
elif vtype == GGUFValueType.INT32: elif vtype == GGUFValueType.INT32:
self.kv_data += struct.pack("<i", val) self.kv_data += struct.pack("<i", val)
elif vtype == GGUFValueType.FLOAT32: elif vtype == GGUFValueType.FLOAT32:
self.kv_data += struct.pack("<f", val) self.kv_data += struct.pack("<f", val)
elif vtype == GGUFValueType.BOOL: elif vtype == GGUFValueType.BOOL:
self.kv_data += struct.pack("?", val) self.kv_data += struct.pack("?", val)
elif vtype == GGUFValueType.STRING: elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<I", len(encoded_val)) self.kv_data += struct.pack("<I", len(encoded_val))
self.kv_data += encoded_val self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY: elif vtype == GGUFValueType.ARRAY:
ltype = set([GGUFValueType.get_type(item) for item in val]) ltype = set([GGUFValueType.get_type(item) for item in val])
assert len(ltype) == 1, "All items in a GGUF array should be of the same type" assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
self.kv_data += struct.pack("<I", list(ltype)[0]) self.kv_data += struct.pack("<I", list(ltype)[0])
self.kv_data += struct.pack("<I", len(val)) self.kv_data += struct.pack("<I", len(val))
for item in val: for item in val:
self.add_val(item, add_vtype=False) self.add_val(item, add_vtype=False)
else: else:
raise ValueError("Invalid GGUF metadata value type") raise ValueError("Invalid GGUF metadata value type")
@staticmethod @staticmethod
def ggml_pad(x: int, n: int) -> int: def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int): def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
encoded_name = name.encode("utf8") encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<I", len(encoded_name)) self.ti_data += struct.pack("<I", len(encoded_name))
self.ti_data += encoded_name self.ti_data += encoded_name
n_dims = len(tensor_shape) n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims) self.ti_data += struct.pack("<I", n_dims)
for i in range(n_dims): for i in range(n_dims):
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i]) self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
self.ti_data += struct.pack("<I", dtype) self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor) self.ti_data += struct.pack("<Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1 self.ti_data_count += 1
def write_tensor_to_file(self, tensor: np.ndarray): def write_tensor_to_file(self, tensor: np.ndarray):
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell() pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
if pad != 0: if pad != 0:
self.fout.write(bytes([0] * pad)) self.fout.write(bytes([0] * pad))
tensor.tofile(self.fout) tensor.tofile(self.fout)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if pad != 0: if pad != 0:
self.fout.write(bytes([0] * pad)) self.fout.write(bytes([0] * pad))
def flush(self): def flush(self):
self.fout.flush() self.fout.flush()
def close(self): def close(self):
self.fout.close() self.fout.close()
def add_architecture(self, architecture: str): def add_architecture(self, architecture: str):
self.add_string(constants.KEY_GENERAL_ARCHITECTURE, self.add_string(constants.KEY_GENERAL_ARCHITECTURE,
architecture) architecture)
def add_author(self, author: str): def add_author(self, author: str):
self.add_string(constants.KEY_GENERAL_AUTHOR, author) self.add_string(constants.KEY_GENERAL_AUTHOR, author)
def add_url(self, url: str): def add_url(self, url: str):
self.add_string(constants.KEY_GENERAL_URL, url) self.add_string(constants.KEY_GENERAL_URL, url)
def add_description(self, description: str): def add_description(self, description: str):
self.add_string(constants.KEY_GENERAL_DESCRIPTION, description) self.add_string(constants.KEY_GENERAL_DESCRIPTION, description)
def add_file_type(self, file_type: str): def add_file_type(self, file_type: str):
self.add_string(constants.KEY_GENERAL_FILE_TYPE, file_type) self.add_string(constants.KEY_GENERAL_FILE_TYPE, file_type)
def add_source_url(self, url: str): def add_source_url(self, url: str):
self.add_string(constants.KEY_GENERAL_SOURCE_URL, url) self.add_string(constants.KEY_GENERAL_SOURCE_URL, url)
def add_source_hf_repo(self, repo: str): def add_source_hf_repo(self, repo: str):
self.add_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo) self.add_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo)
def add_name(self, name: str): def add_name(self, name: str):
self.add_string(constants.KEY_GENERAL_NAME, name) self.add_string(constants.KEY_GENERAL_NAME, name)
def add_quantization_version(self, quantization_version: GGMLQuantizationType): def add_quantization_version(self, quantization_version: GGMLQuantizationType):
self.add_uint32( self.add_uint32(
constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version) constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
def add_custom_alignment(self, alignment: int): def add_custom_alignment(self, alignment: int):
self.data_alignment = alignment self.data_alignment = alignment
self.add_uint32(constants.KEY_GENERAL_ALIGNMENT, alignment) self.add_uint32(constants.KEY_GENERAL_ALIGNMENT, alignment)
def add_context_length(self, llm: str, length: int): def add_context_length(self, llm: str, length: int):
self.add_uint32( self.add_uint32(
constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length) constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
def add_embedding_length(self, llm: str, length: int): def add_embedding_length(self, llm: str, length: int):
self.add_uint32( self.add_uint32(
constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length) constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
def add_block_count(self, llm: str, length: int): def add_block_count(self, llm: str, length: int):
self.add_uint32( self.add_uint32(
constants.KEY_LLM_BLOCK_COUNT.format(llm=llm), length) constants.KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
def add_feed_forward_length(self, llm: str, length: int): def add_feed_forward_length(self, llm: str, length: int):
self.add_uint32( self.add_uint32(
constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length) constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
def add_parallel_residual(self, llm: str, use: bool): def add_parallel_residual(self, llm: str, use: bool):
self.add_bool( self.add_bool(
constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use) constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
def add_tensor_data_layout(self, llm: str, layout: str): def add_tensor_data_layout(self, llm: str, layout: str):
self.add_string( self.add_string(
constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout) constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
def add_head_count(self, llm: str, count: int): def add_head_count(self, llm: str, count: int):
self.add_uint32( self.add_uint32(
constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count) constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
def add_head_count_kv(self, llm: str, count: int): def add_head_count_kv(self, llm: str, count: int):
self.add_uint32( self.add_uint32(
constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count) constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
def add_max_alibi_bias(self, llm: str, bias: float): def add_max_alibi_bias(self, llm: str, bias: float):
self.add_float32( self.add_float32(
constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias) constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
def add_clamp_kqv(self, llm: str, value: float): def add_clamp_kqv(self, llm: str, value: float):
self.add_float32( self.add_float32(
constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value) constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
def add_layer_norm_eps(self, llm: str, value: float): def add_layer_norm_eps(self, llm: str, value: float):
self.add_float32( self.add_float32(
constants.KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value) constants.KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
def add_layer_norm_rms_eps(self, llm: str, value: float): def add_layer_norm_rms_eps(self, llm: str, value: float):
self.add_float32( self.add_float32(
constants.KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value) constants.KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
def add_rope_dimension_count(self, llm: str, count: int): def add_rope_dimension_count(self, llm: str, count: int):
self.add_uint32( self.add_uint32(
constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count) constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
def add_rope_scale(self, llm: str, value: float): def add_rope_scale(self, llm: str, value: float):
self.add_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value) self.add_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value)
def add_tokenizer_model(self, model: str): def add_tokenizer_model(self, model: str):
self.add_string(constants.KEY_TOKENIZER_MODEL, model) self.add_string(constants.KEY_TOKENIZER_MODEL, model)
def add_token_list(self, tokens: List): def add_token_list(self, tokens: List):
self.add_array(constants.KEY_TOKENIZER_LIST, tokens) self.add_array(constants.KEY_TOKENIZER_LIST, tokens)
def add_token_merges(self, merges: List): def add_token_merges(self, merges: List):
self.add_array(constants.KEY_TOKENIZER_MERGES, merges) self.add_array(constants.KEY_TOKENIZER_MERGES, merges)
def add_token_scores(self, scores: List[float]): def add_token_scores(self, scores: List[float]):
self.add_array(constants.KEY_TOKENIZER_SCORES, scores) self.add_array(constants.KEY_TOKENIZER_SCORES, scores)
def add_bos_token_id(self, id: int): def add_bos_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_BOS_ID, id) self.add_uint32(constants.KEY_TOKENIZER_BOS_ID, id)
def add_eos_token_id(self, id: int): def add_eos_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_EOS_ID, id) self.add_uint32(constants.KEY_TOKENIZER_EOS_ID, id)
def add_unk_token_id(self, id: int): def add_unk_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_UNK_ID, id) self.add_uint32(constants.KEY_TOKENIZER_UNK_ID, id)
def add_sep_token_id(self, id: int): def add_sep_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_SEP_ID, id) self.add_uint32(constants.KEY_TOKENIZER_SEP_ID, id)
def add_pad_token_id(self, id: int): def add_pad_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_PAD_ID, id) self.add_uint32(constants.KEY_TOKENIZER_PAD_ID, id)
# Example usage: # Example usage:
if __name__ == "__main__": if __name__ == "__main__":
# Example usage with a file # Example usage with a file
gguf_writer = GGUFWriter.open("example.gguf") gguf_writer = GGUFWriter.open("example.gguf")
gguf_writer.add_architecture("llama") gguf_writer.add_architecture("llama")
gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
gguf_writer.add_custom_alignment(64) gguf_writer.add_custom_alignment(64)
tensor1 = np.ones((32,), dtype=np.float32) * 100.0 tensor1 = np.ones((32,), dtype=np.float32) * 100.0
tensor2 = np.ones((32,), dtype=np.float32) * 101.0 tensor2 = np.ones((32,), dtype=np.float32) * 101.0
gguf_writer.add_tensor_info("tensor0", tensor1) gguf_writer.add_tensor_info("tensor0", tensor1)
gguf_writer.add_tensor_info("tensor1", tensor2) gguf_writer.add_tensor_info("tensor1", tensor2)
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
gguf_writer.write_ti_data_to_file() gguf_writer.write_ti_data_to_file()
gguf_writer.write_tensor_to_file(tensor1) gguf_writer.write_tensor_to_file(tensor1)
gguf_writer.write_tensor_to_file(tensor2) gguf_writer.write_tensor_to_file(tensor2)
gguf_writer.close() gguf_writer.close()
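
For reference, write_header_to_file() above emits four little-endian uint32 fields (magic, version, tensor-info count, key/value count), so the start of a file produced by this writer can be inspected with a small sketch like the following (illustrative only; "example.gguf" is the file from the example usage above):

import struct

# Read back the GGUF header fields written by GGUFWriter.write_header_to_file().
with open("example.gguf", "rb") as f:
    magic, version, n_tensors, n_kv = struct.unpack("<4I", f.read(16))

assert magic == 0x47475546  # GGUF_MAGIC
print(version, n_tensors, n_kv)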