mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
llama : sync gguf-llama.cpp with latest llama.cpp (#2608)
* llama : sync gguf-llama.cpp with latest llama.cpp * minor : indentation + assert * llama : refactor gguf_buffer and gguf_ctx_buffer * llama : minor
This commit is contained in:
parent
6f64b6c0f8
commit
f00780b2ee
@ -8,14 +8,19 @@
|
|||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
/*
|
|
||||||
|
#undef MIN
|
||||||
|
#undef MAX
|
||||||
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static std::string to_string(const T & val) {
|
static std::string to_string(const T & val) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << val;
|
ss << val;
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
|
void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
|
||||||
const int32_t n = val.size();
|
const int32_t n = val.size();
|
||||||
fout.write((const char *) &n, sizeof(n));
|
fout.write((const char *) &n, sizeof(n));
|
||||||
@ -377,28 +382,28 @@ bool gguf_ex_read_2(const std::string & fname) {
|
|||||||
|
|
||||||
struct gguf_file file(fname.c_str(), "rb");
|
struct gguf_file file(fname.c_str(), "rb");
|
||||||
gguf_mmap data_mmap(&file, 0, false);
|
gguf_mmap data_mmap(&file, 0, false);
|
||||||
|
|
||||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||||
|
|
||||||
for (int i = 0; i < n_tensors; ++i) {
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
const char * name = gguf_get_tensor_name(ctx, i);
|
const char * name = gguf_get_tensor_name(ctx, i);
|
||||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||||
|
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||||
|
|
||||||
cur->data = static_cast<char *>(data_mmap.addr) + offset;
|
cur->data = static_cast<char *>(data_mmap.addr) + offset;
|
||||||
|
|
||||||
// print first 10 elements
|
// print first 10 elements
|
||||||
const float * data = (const float *) cur->data;
|
const float * data = (const float *) cur->data;
|
||||||
|
|
||||||
printf("%s data[:10] : ", name);
|
printf("%s data[:10] : ", name);
|
||||||
|
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
|
||||||
for (int j = 0; j < 10; ++j) {
|
|
||||||
printf("%f ", data[j]);
|
printf("%f ", data[j]);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
||||||
|
|
||||||
ggml_free(ctx_data);
|
ggml_free(ctx_data);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
|
@ -38,6 +38,9 @@ struct ggml_metal_context;
|
|||||||
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
||||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
void ggml_metal_free(struct ggml_metal_context * ctx);
|
||||||
|
|
||||||
|
void * ggml_metal_host_malloc(size_t n);
|
||||||
|
void ggml_metal_host_free (void * data);
|
||||||
|
|
||||||
// set the number of command buffers to use
|
// set the number of command buffers to use
|
||||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
||||||
|
|
||||||
|
15
ggml-metal.m
15
ggml-metal.m
@ -224,6 +224,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void * ggml_metal_host_malloc(size_t n) {
|
||||||
|
void * data = NULL;
|
||||||
|
const int result = posix_memalign((void **) &data, getpagesize(), n);
|
||||||
|
if (result != 0) {
|
||||||
|
fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_metal_host_free(void * data) {
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
||||||
ctx->n_cb = n_cb;
|
ctx->n_cb = n_cb;
|
||||||
}
|
}
|
||||||
|
989
gguf-llama.cpp
989
gguf-llama.cpp
File diff suppressed because it is too large
Load Diff
28
gguf-llama.h
28
gguf-llama.h
@ -41,10 +41,6 @@
|
|||||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef LLAMA_DEFAULT_RMS_EPS
|
|
||||||
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
@ -74,12 +70,23 @@ extern "C" {
|
|||||||
|
|
||||||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||||
|
|
||||||
struct llama_context_params {
|
enum llama_log_level {
|
||||||
|
LLAMA_LOG_LEVEL_ERROR = 2,
|
||||||
|
LLAMA_LOG_LEVEL_WARN = 3,
|
||||||
|
LLAMA_LOG_LEVEL_INFO = 4
|
||||||
|
};
|
||||||
|
|
||||||
|
// Signature for logging events
|
||||||
|
// Note that text includes the new line character at the end for most events.
|
||||||
|
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
||||||
|
// if it exists.
|
||||||
|
// It might not exist for progress report where '.' is output repeatedly.
|
||||||
|
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
|
struct llama_context_params {
|
||||||
uint32_t seed; // RNG seed, -1 for random
|
uint32_t seed; // RNG seed, -1 for random
|
||||||
int32_t n_ctx; // text context
|
int32_t n_ctx; // text context
|
||||||
int32_t n_batch; // prompt processing batch size
|
int32_t n_batch; // prompt processing batch size
|
||||||
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
|
|
||||||
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
|
|
||||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||||
|
|
||||||
@ -96,6 +103,7 @@ extern "C" {
|
|||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
||||||
|
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
||||||
bool f16_kv; // use fp16 for KV cache
|
bool f16_kv; // use fp16 for KV cache
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||||
bool vocab_only; // only load the vocabulary, no weights
|
bool vocab_only; // only load the vocabulary, no weights
|
||||||
@ -129,7 +137,7 @@ extern "C" {
|
|||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
typedef struct llama_model_quantize_params {
|
typedef struct llama_model_quantize_params {
|
||||||
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
@ -182,6 +190,10 @@ extern "C" {
|
|||||||
int32_t n_eval;
|
int32_t n_eval;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Set callback for all future logging events.
|
||||||
|
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||||
|
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||||
|
|
||||||
LLAMA_API int llama_max_devices();
|
LLAMA_API int llama_max_devices();
|
||||||
|
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||||
|
97
gguf-util.h
97
gguf-util.h
@ -64,13 +64,6 @@ static std::string format(const char * fmt, ...) {
|
|||||||
return std::string(buf.data(), size);
|
return std::string(buf.data(), size);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
static std::string to_string(const T & val) {
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << val;
|
|
||||||
return ss.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: can we merge this one and gguf_context?
|
// TODO: can we merge this one and gguf_context?
|
||||||
struct gguf_file {
|
struct gguf_file {
|
||||||
// use FILE * so we don't have to re-open the file to mmap
|
// use FILE * so we don't have to re-open the file to mmap
|
||||||
@ -474,94 +467,4 @@ struct gguf_mlock {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
|
||||||
struct gguf_buffer {
|
|
||||||
uint8_t * addr = NULL;
|
|
||||||
size_t size = 0;
|
|
||||||
|
|
||||||
gguf_buffer() = default;
|
|
||||||
|
|
||||||
void resize(size_t len) {
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
free(addr);
|
|
||||||
int result = posix_memalign((void **) &addr, getpagesize(), len);
|
|
||||||
if (result == 0) {
|
|
||||||
memset(addr, 0, len);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
addr = NULL;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
delete[] addr;
|
|
||||||
addr = new uint8_t[len];
|
|
||||||
#endif
|
|
||||||
size = len;
|
|
||||||
}
|
|
||||||
|
|
||||||
~gguf_buffer() {
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
free(addr);
|
|
||||||
#else
|
|
||||||
delete[] addr;
|
|
||||||
#endif
|
|
||||||
addr = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// disable copy and move
|
|
||||||
gguf_buffer(const gguf_buffer&) = delete;
|
|
||||||
gguf_buffer(gguf_buffer&&) = delete;
|
|
||||||
gguf_buffer& operator=(const gguf_buffer&) = delete;
|
|
||||||
gguf_buffer& operator=(gguf_buffer&&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
#include "ggml-cuda.h"
|
|
||||||
struct gguf_ctx_buffer {
|
|
||||||
uint8_t * addr = NULL;
|
|
||||||
bool is_cuda;
|
|
||||||
size_t size = 0;
|
|
||||||
|
|
||||||
gguf_ctx_buffer() = default;
|
|
||||||
|
|
||||||
void resize(size_t size) {
|
|
||||||
free();
|
|
||||||
|
|
||||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
|
||||||
if (addr) {
|
|
||||||
is_cuda = true;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// fall back to pageable memory
|
|
||||||
addr = new uint8_t[size];
|
|
||||||
is_cuda = false;
|
|
||||||
}
|
|
||||||
this->size = size;
|
|
||||||
}
|
|
||||||
|
|
||||||
void free() {
|
|
||||||
if (addr) {
|
|
||||||
if (is_cuda) {
|
|
||||||
ggml_cuda_host_free(addr);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
delete[] addr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
addr = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
~gguf_ctx_buffer() {
|
|
||||||
free();
|
|
||||||
}
|
|
||||||
|
|
||||||
// disable copy and move
|
|
||||||
gguf_ctx_buffer(const gguf_ctx_buffer&) = delete;
|
|
||||||
gguf_ctx_buffer(gguf_ctx_buffer&&) = delete;
|
|
||||||
gguf_ctx_buffer& operator=(const gguf_ctx_buffer&) = delete;
|
|
||||||
gguf_ctx_buffer& operator=(gguf_ctx_buffer&&) = delete;
|
|
||||||
};
|
|
||||||
#else
|
|
||||||
typedef gguf_buffer gguf_ctx_buffer;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user