mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-02-07 00:43:14 +01:00)

Support V3 format upgrade

This commit is contained in:
  parent 10cbc311e3
  commit 006d5707e8

ggml.c (138 lines changed)
@@ -813,91 +813,117 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
-void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
-
-    if (type == GGML_TYPE_Q4_0) {
-        int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q4_0);
-        block_q4_0 *blk = (block_q4_0 *)data;
-        block_q4_0 new_blk;
-
-        for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
-            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
-        }
+static void quantize_shuffle_block(const uint8_t* src, uint8_t* dest, int half_size)
+{
+    for (int j = 0; j < half_size; j++) {
+        // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+        // new: d0, d_half, d1, d_half1
+        uint8_t d1;
+        uint8_t d2;
+
+        d1 = src[0 + j];
+        d2 = src[half_size + j];
+
+        dest[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+        dest[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+    }
+}
+
+typedef struct {
+    float d;               // delta
+    uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0_old;
+typedef struct {
+    float d;               // delta
+    float m;               // min
+    uint8_t qs[QK4_1 / 2]; // nibbles / quants
+} block_q4_1_old;
+typedef struct {
+    float d;               // delta
+    int8_t qs[QK8_0];      // quants
+} block_q8_0_old;
+
+void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuffle) {
+    if (type == GGML_TYPE_Q4_0) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = *size / sizeof(block_q4_0_old);
+        block_q4_0_old *blk = (block_q4_0_old *)data;
+        block_q4_0 *new_blk = (block_q4_0 *)data;
+        block_q4_0 new_blk_buf;
+        *size = nb * sizeof(block_q4_0);
+
+        for (size_t i = 0; i < nb ; i++) {
+            new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
+
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
+            else
+                memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+
+            memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_0));
+        }
     } else if (type == GGML_TYPE_Q4_1) {
         int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q4_1);
-        block_q4_1 *blk = (block_q4_1 *)data;
-        block_q4_1 new_blk;
+        const size_t nb = *size / sizeof(block_q4_1_old);
+        block_q4_1_old *blk = (block_q4_1_old *)data;
+        block_q4_1 *new_blk = (block_q4_1 *)data;
+        block_q4_1 new_blk_buf;
+        *size = nb * sizeof(block_q4_1);
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
-            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+            new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
+            new_blk_buf.m = GGML_FP32_TO_FP16(blk[i].m);
+
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
+            else
+                memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+
+            memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_1));
         }
     } else if (type == GGML_TYPE_Q5_0) {
+        // No size diff
         int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q5_0);
+        const size_t nb = *size / sizeof(block_q5_0);
         block_q5_0 *blk = (block_q5_0 *)data;
         block_q5_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
+            else
+                memcpy(new_blk.qs, blk[i].qs, qk / 2);
             memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
         }
     } else if (type == GGML_TYPE_Q5_1) {
+        // No size diff
        int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q5_1);
+        const size_t nb = *size / sizeof(block_q5_1);
         block_q5_1 *blk = (block_q5_1 *)data;
         block_q5_1 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
-            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
+            else
+                memcpy(new_blk.qs, blk[i].qs, qk / 2);
+            memcpy(&blk[i], &new_blk, sizeof(new_blk));
         }
+    } else if (type == GGML_TYPE_Q8_0) {
+        // no shuffle
+        int qk = ggml_blck_size(type);
+        const size_t nb = *size / sizeof(block_q8_0_old);
+        block_q8_0_old *blk = (block_q8_0_old *)data;
+        block_q8_0 *new_blk = (block_q8_0 *)data;
+        block_q8_0 new_blk_buf;
+
+        *size = nb * sizeof(block_q8_0);
+
+        for (size_t i = 0; i < nb ; i++) {
+            new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
+
+            memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+            memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q8_0));
+        }
     }
 }
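The "// old:" / "// new:" comments above describe the nibble re-interleave that quantize_shuffle_block applies when upgrading GGJT_V1 data: in the old layout byte j of qs packs quants 2j and 2j+1, while in the new layout byte k packs quant k in the low nibble and quant k + qk/2 in the high nibble. The following standalone check is a sketch (not part of the commit) that exercises the helper on a single QK4_0 = 32 block; the packing assumptions are taken from those comments, and the helper body is copied from the diff with its comments trimmed.

    // standalone sketch, not part of the commit: verify the nibble re-interleave
    #include <cassert>
    #include <cstdint>

    // copied from the diff above (comments trimmed)
    static void quantize_shuffle_block(const uint8_t* src, uint8_t* dest, int half_size)
    {
        for (int j = 0; j < half_size; j++) {
            uint8_t d1 = src[0 + j];
            uint8_t d2 = src[half_size + j];
            dest[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
            dest[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
        }
    }

    int main() {
        const int qk = 32; // QK4_0
        uint8_t old_qs[16];
        uint8_t new_qs[16];
        // old layout: byte j holds quant 2j (low nibble) and quant 2j+1 (high nibble);
        // use the quant index (mod 16) as its value so the result is easy to check
        for (int j = 0; j < qk / 2; j++) {
            old_qs[j] = (uint8_t)(((2 * j) & 0x0f) | (((2 * j + 1) & 0x0f) << 4));
        }
        quantize_shuffle_block(old_qs, new_qs, qk / 4);
        // new layout: byte k holds quant k (low nibble) and quant k + qk/2 (high nibble)
        for (int k = 0; k < qk / 2; k++) {
            assert((new_qs[k] & 0x0f) == (k & 0x0f));
            assert((new_qs[k] >> 4) == ((k + qk / 2) & 0x0f));
        }
        return 0;
    }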
ggml.h (2 lines changed)
@@ -1086,7 +1086,7 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
-    GGML_API void quantize_upgrade(enum ggml_type type, void* data, size_t size);
+    GGML_API void quantize_upgrade(enum ggml_type type, void* data, size_t *size, bool needShuffle);
 
     //
     // system info
     //
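A minimal usage sketch of the new declaration (not from the commit; the buffer and the helper name are hypothetical). Because the upgraded Q4_0/Q4_1/Q8_0 blocks store fp16 scales instead of fp32, the payload shrinks in place, which is why the byte count is now passed by pointer and updated by the call.

    #include "ggml.h"

    // buf holds one tensor's worth of pre-V3 Q4_0 blocks already read into memory
    static void upgrade_q4_0_in_place(void * buf, size_t * nbytes) {
        // needShuffle = true only for GGJT_V1 data, which still used the old nibble order;
        // GGJT_V2 data only needs the fp32 -> fp16 delta conversion
        quantize_upgrade(GGML_TYPE_Q4_0, buf, nbytes, /*needShuffle=*/true);
        // *nbytes now equals nb * sizeof(block_q4_0), smaller than before the call
    }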
llama.cpp (75 lines changed)
@@ -271,6 +271,14 @@ struct llama_context {
     }
 };
 
+enum llama_file_version {
+    LLAMA_FILE_VERSION_GGML,
+    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
+    LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
+};
+
 template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
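For context, a sketch (not part of this diff) of how these enum values are typically derived when the file header is read: the loader reads a 32-bit magic and, for versioned magics, a 32-bit version. The hex constants below are the usual 'ggml' / 'ggmf' / 'ggjt' magic values from llama.h of this era; treat the helper itself and its name as illustrative.

    #include <cstdint>
    #include <stdexcept>

    // assumes the llama_file_version enum added above is in scope
    static llama_file_version detect_file_version(uint32_t magic, uint32_t version) {
        if (magic == 0x67676d6cu) return LLAMA_FILE_VERSION_GGML;                    // 'ggml', unversioned
        if (magic == 0x67676d66u && version == 1) return LLAMA_FILE_VERSION_GGMF_V1; // 'ggmf'
        if (magic == 0x67676a74u) {                                                  // 'ggjt'
            if (version == 1) return LLAMA_FILE_VERSION_GGJT_V1;
            if (version == 2) return LLAMA_FILE_VERSION_GGJT_V2;
            if (version == 3) return LLAMA_FILE_VERSION_GGJT_V3;
        }
        throw std::runtime_error("unknown (magic, version) combination");
    }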
@@ -305,6 +313,28 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
+static size_t llama_calc_tensor_size_prev3(const std::vector<uint32_t> & ne, enum ggml_type type) {
+    size_t size = ggml_type_size(type);
+
+    switch (type)
+    {
+    case GGML_TYPE_Q4_0:
+        size += 2;
+        break;
+    case GGML_TYPE_Q4_1:
+        size += 4;
+        break;
+    case GGML_TYPE_Q8_0:
+        size += 2;
+        break;
+    }
+
+    for (uint32_t dim : ne) {
+        size = checked_mul<size_t>(size, dim);
+    }
+    return size / ggml_blck_size(type);
+}
+
 struct llama_load_tensor_shard {
     std::vector<uint32_t> ne;
     size_t size;
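The per-type corrections in llama_calc_tensor_size_prev3 come from the fp32-to-fp16 scale change: ggml_type_size() now reports the smaller V3 block sizes, so tensors read from pre-V3 files need the old, larger block sizes added back. A worked example follows (a sketch, not from the commit; block sizes are derived from the struct layouts in the ggml.c diff, and the tensor shape is hypothetical).

    #include <cstdint>
    #include <cstdio>

    int main() {
        // bytes per block of 32 weights:
        //   Q4_0: V3 = 2 (fp16 d)        + 16 = 18;  pre-V3 = 4 (float d)        + 16 = 20  -> +2
        //   Q4_1: V3 = 2 + 2 (fp16 d, m) + 16 = 20;  pre-V3 = 4 + 4 (float d, m) + 16 = 24  -> +4
        //   Q8_0: V3 = 2 (fp16 d)        + 32 = 34;  pre-V3 = 4 (float d)        + 32 = 36  -> +2
        const uint64_t n_elem = 4096ull * 4096ull; // hypothetical Q4_0 tensor
        const uint64_t qk     = 32;                // QK4_0
        printf("Q4_0 bytes in a V3 file:     %llu\n", (unsigned long long)(n_elem / qk * 18));
        printf("Q4_0 bytes in a pre-V3 file: %llu\n", (unsigned long long)(n_elem / qk * 20));
        return 0;
    }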
@@ -312,8 +342,11 @@ struct llama_load_tensor_shard {
     size_t file_idx;
     size_t file_off;
 
-    void calc_size() {
+    void calc_size(llama_file_version file_version) {
+        if (file_version == LLAMA_FILE_VERSION_GGJT_V3)
             size = llama_calc_tensor_size(ne, type);
+        else
+            size = llama_calc_tensor_size_prev3(ne, type);
     }
 };
 
@@ -336,11 +369,11 @@ struct llama_load_tensor {
 
     llama_load_tensor(const std::string & name) : name(name) {}
 
-    void calc_all() {
+    void calc_all(llama_file_version file_version) {
         calc_type();
         calc_split_type();
         calc_ne();
-        calc_size();
+        calc_size(file_version);
     }
 
     void calc_type() {
@@ -392,8 +425,11 @@ struct llama_load_tensor {
         }
     }
 
-    void calc_size() {
+    void calc_size(llama_file_version file_version) {
+        if (file_version == LLAMA_FILE_VERSION_GGJT_V3)
             size = llama_calc_tensor_size(ne, type);
+        else
+            size = llama_calc_tensor_size_prev3(ne, type);
     }
 };
 
@@ -403,14 +439,6 @@ struct llama_load_tensors_map {
     std::unordered_map<std::string, size_t> name_to_idx;
 };
 
-enum llama_file_version {
-    LLAMA_FILE_VERSION_GGML,
-    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-    LLAMA_FILE_VERSION_GGJT_V1, // added padding
-    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
-    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-};
-
 struct llama_file_loader {
     llama_file file;
     llama_file_version file_version;
@@ -513,7 +541,7 @@ struct llama_file_loader {
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
 
-        shard.calc_size();
+        shard.calc_size(file_version);
         file.seek(shard.size, SEEK_CUR);
 
         auto it = tensors_map.name_to_idx.find(name);
@@ -618,7 +646,7 @@ struct llama_model_loader {
         }
         this->use_mmap = use_mmap;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
+            lt.calc_all(first_file->file_version);
         }
     }
 
@@ -2074,18 +2102,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         size_t new_size;
         llama_buffer work;
 
-        if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
-            if (((tensor.type == GGML_TYPE_Q4_0)
-                || (tensor.type == GGML_TYPE_Q4_1)
-                || (tensor.type == GGML_TYPE_Q5_0)
-                || (tensor.type == GGML_TYPE_Q5_1)) && (quantized_type == tensor.type)) {
+        bool needShuffle = (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1);
+
+        if (model_loader->file_loaders.at(0)->file_version < LLAMA_FILE_VERSION_GGJT_V3 && quantize) {
+            if ((quantized_type == tensor.type) &&
+                (tensor.type == GGML_TYPE_Q4_0 || tensor.type == GGML_TYPE_Q4_1
+                 || tensor.type == GGML_TYPE_Q5_0 || tensor.type == GGML_TYPE_Q5_1
+                 || tensor.type == GGML_TYPE_Q8_0)) {
                 // convert
                 new_type = tensor.type;
                 new_data = tensor.data;
                 new_size = tensor.size;
-                quantize_upgrade(new_type, new_data, new_size);
-                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            } else {
+                quantize_upgrade(new_type, new_data, &new_size, needShuffle);
+                printf("Upgrade - size = %8.3f MB\n", new_size/1024.0/1024.0);
+            }
+            else {
                 throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
             }
         } else if (!quantize) {
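The branch above is what makes the format upgrade reachable from the normal quantization entry point: re-quantizing a pre-V3 file to the same quantized type rewrites each tensor in place via quantize_upgrade, and shuffling is only requested for GGJT_V1 inputs, since GGJT_V2 already uses the new nibble order. A usage sketch follows, assuming the llama.h API of this era (the function signature, ftype name, and file names here are assumptions and may differ in other versions).

    #include "llama.h"

    int main() {
        // input: an existing Q4_0 model in GGJT_V1 or GGJT_V2 format
        // output: the same weights rewritten in the V3 layout
        return llama_model_quantize("ggml-model-q4_0-old.bin",
                                    "ggml-model-q4_0-v3.bin",
                                    LLAMA_FTYPE_MOSTLY_Q4_0,
                                    /*nthread=*/4);
    }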