gguf : start implementing quantization (WIP)

This commit is contained in:
M. Yusuf Sarıgöz 2023-08-12 14:28:17 +03:00
parent c4f02b4f74
commit b2571af255
3 changed files with 14 additions and 6 deletions

View File

@@ -393,7 +393,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
gguf: examples/gguf/gguf.cpp build-info.h ggml.o $(OBJS) gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS) gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS)

View File

@@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "gguf-util.h" #include "gguf-util.h"
#include "gguf-llama.h"
#include <cstdio> #include <cstdio>
#include <cinttypes> #include <cinttypes>
@@ -7,14 +8,14 @@
#include <sstream> #include <sstream>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
/*
template<typename T> template<typename T>
static std::string to_string(const T & val) { static std::string to_string(const T & val) {
std::stringstream ss; std::stringstream ss;
ss << val; ss << val;
return ss.str(); return ss.str();
} }
*/
void gguf_ex_write_str(std::ofstream & fout, const std::string & val) { void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
const int32_t n = val.size(); const int32_t n = val.size();
fout.write((const char *) &n, sizeof(n)); fout.write((const char *) &n, sizeof(n));
@@ -414,7 +415,7 @@ int main(int argc, char ** argv) {
const std::string fname(argv[1]); const std::string fname(argv[1]);
const std::string mode (argv[2]); const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w"); GGML_ASSERT((mode == "r" || mode == "w" || mode == "q") && "mode must be r, w or q");
if (mode == "w") { if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file"); GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
@@ -422,6 +423,9 @@ int main(int argc, char ** argv) {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file"); GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file"); GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file"); GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
} else if (mode == "q") {
llama_model_quantize_params params = llama_model_quantize_default_params();
llama_model_quantize(fname.c_str(), "quant.gguf", &params);
} }
return 0; return 0;

View File

@@ -738,15 +738,19 @@ struct gguf_file_saver {
info_offset = file.tell(); info_offset = file.tell();
size_t count = gguf_get_data_offset(fl->gguf_ctx) - info_offset; size_t count = gguf_get_data_offset(fl->gguf_ctx) - info_offset;
file.write_zeros(count); file.write_zeros(count);
printf("info_offset = %zu\n", info_offset);
file.seek(info_offset, SEEK_SET);
GGML_ASSERT(info_offset == file.tell());
} }
size_t write_tensor_info(llama_load_tensor & tensor) { size_t write_tensor_info(llama_load_tensor & tensor) {
size_t total_written = 0; size_t total_written = 0;
file.seek(0, info_offset); file.seek(info_offset, SEEK_SET);
GGML_ASSERT(info_offset == file.tell());
total_written += file.write_str(tensor.name); total_written += file.write_str(tensor.name);
int32_t n_dims = tensor.ne.size(); int32_t n_dims = tensor.ne.size();
file.write_i32(n_dims); total_written += file.write_i32(n_dims);
for (int32_t i = 0; i < n_dims; ++i) { for (int32_t i = 0; i < n_dims; ++i) {
total_written += file.write_i32(i); total_written += file.write_i32(i);
} }