Source: https://github.com/ggerganov/llama.cpp.git
gguf : start implementing quantization (WIP)
commit b2571af255
parent c4f02b4f74
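In short: the gguf example binary now links against gguf-llama.o, its main() gains a "q" mode that runs llama_model_quantize() with default parameters, and gguf_file_saver starts reserving space at info_offset and writing tensor info records there.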
--- a/Makefile
+++ b/Makefile
@@ -393,7 +393,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
-gguf: examples/gguf/gguf.cpp build-info.h ggml.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS)
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "gguf-util.h"
+#include "gguf-llama.h"
 
 #include <cstdio>
 #include <cinttypes>
@@ -7,14 +8,14 @@
 #include <sstream>
 #include <fstream>
 #include <vector>
-
+/*
 template<typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
     ss << val;
     return ss.str();
 }
-
+*/
 void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
     const int32_t n = val.size();
     fout.write((const char *) &n, sizeof(n));
@@ -414,7 +415,7 @@ int main(int argc, char ** argv) {
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);
 
-    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+    GGML_ASSERT((mode == "r" || mode == "w" || mode == "q") && "mode must be r, w or q");
 
     if (mode == "w") {
         GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
@@ -422,6 +423,9 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
+    } else if (mode == "q") {
+        llama_model_quantize_params params = llama_model_quantize_default_params();
+        llama_model_quantize(fname.c_str(), "quant.gguf", &params);
     }
 
     return 0;
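The "q" branch above hard-codes its output name and uses the defaults unchanged. Below is a minimal sketch of calling the same entry point directly; the params fields (nthread, ftype) and LLAMA_FTYPE_MOSTLY_Q4_0 are assumptions carried over from llama.h of this era and do not appear in this diff — check gguf-llama.h for the struct this WIP actually ships.

    // Hedged sketch: drive llama_model_quantize() outside the example's "q" mode.
    // Field names nthread/ftype are assumed from llama.h, not confirmed by this diff.
    #include "gguf-llama.h"
    #include <cstdio>

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.nthread = 4;                       // assumed field: quantization worker threads
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_0; // assumed field: target quantization type
        // the example above fixes the output name as "quant.gguf"
        if (llama_model_quantize("model.gguf", "quant.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }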
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -738,15 +738,19 @@ struct gguf_file_saver {
         info_offset = file.tell();
         size_t count = gguf_get_data_offset(fl->gguf_ctx) - info_offset;
         file.write_zeros(count);
+        printf("info_offset = %zu\n", info_offset);
+        file.seek(info_offset, SEEK_SET);
+        GGML_ASSERT(info_offset == file.tell());
     }
 
     size_t write_tensor_info(llama_load_tensor & tensor) {
         size_t total_written = 0;
-        file.seek(0, info_offset);
+        file.seek(info_offset, SEEK_SET);
+        GGML_ASSERT(info_offset == file.tell());
         total_written += file.write_str(tensor.name);
 
         int32_t n_dims = tensor.ne.size();
-        file.write_i32(n_dims);
+        total_written += file.write_i32(n_dims);
         for (int32_t i = 0; i < n_dims; ++i) {
             total_written += file.write_i32(i);
         }
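For orientation, here is a self-contained sketch of the record write_tensor_info() appears to emit: a length-prefixed name, an i32 dimension count, then one i32 per dimension. TensorInfo and the helpers below are hypothetical stand-ins, not gguf-llama types. Note that the WIP loop writes the loop index i where the dimension size tensor.ne[i] is presumably intended; the sketch writes the size.

    // Hedged sketch of the tensor-info record layout implied by the diff.
    // "TensorInfo" is an illustrative stand-in for llama_load_tensor.
    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>

    struct TensorInfo {
        std::string name;
        std::vector<int32_t> ne; // dimension sizes
    };

    static size_t write_i32(std::ofstream & f, int32_t v) {
        f.write(reinterpret_cast<const char *>(&v), sizeof(v));
        return sizeof(v);
    }

    static size_t write_tensor_info(std::ofstream & f, const TensorInfo & t) {
        size_t written = 0;
        written += write_i32(f, (int32_t) t.name.size()); // name length prefix
        f.write(t.name.data(), t.name.size());            // name bytes
        written += t.name.size();
        written += write_i32(f, (int32_t) t.ne.size());   // n_dims
        for (int32_t d : t.ne) {
            written += write_i32(f, d);                   // ne[i], not the index i
        }
        return written;
    }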