diff --git a/ggml.c b/ggml.c
index 77a3d89f7..f4c34f5d1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -813,6 +813,48 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
+// Convert Q4 blocks from the GGJT v1 nibble layout (two consecutive values
+// packed per byte) to the current layout (low nibbles hold the first half of
+// the block, high nibbles the second half), in place.
+void quantize_upgrade(enum ggml_type type, void * data, size_t size) {
+    if (type == GGML_TYPE_Q4_0) {
+        const int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q4_0);
+        block_q4_0 * blk = (block_q4_0 *) data;
+        block_q4_0 new_blk;
+
+        for (size_t i = 0; i < nb; i++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: byte j packs x[2j] (low nibble) and x[2j+1] (high nibble)
+                // new: byte j packs x[j] (low nibble) and x[j + qk/2] (high nibble)
+                const uint8_t d1 = blk[i].qs[j];        // x[2j],        x[2j+1]
+                const uint8_t d2 = blk[i].qs[qk/4 + j]; // x[qk/2 + 2j], x[qk/2 + 2j+1]
+
+                new_blk.qs[j*2 + 0] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[j*2 + 1] = (d1 >> 4)   | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q4_1) {
+        // same nibble shuffle as Q4_0; only the block struct differs
+        const int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q4_1);
+        block_q4_1 * blk = (block_q4_1 *) data;
+        block_q4_1 new_blk;
+
+        for (size_t i = 0; i < nb; i++) {
+            for (int j = 0; j < qk/4; j++) {
+                const uint8_t d1 = blk[i].qs[j];
+                const uint8_t d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[j*2 + 0] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[j*2 + 1] = (d1 >> 4)   | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    }
+}
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     static const int qk = QK4_0;
diff --git a/ggml.h b/ggml.h
index 51a616c50..787f927cd 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1086,6 +1086,7 @@ extern "C" {
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    GGML_API void quantize_upgrade(enum ggml_type type, void * data, size_t size);
     //
     // system info
     //
 
diff --git a/llama.cpp b/llama.cpp
index 4cbc8d6b6..fb231d56d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2074,7 +2074,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         size_t new_size;
         llama_buffer work;
 
-        if (!quantize) {
+        if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
+            // same quantization type requested: upgrade the block layout in place
+            if ((tensor.type == GGML_TYPE_Q4_0 && quantized_type == GGML_TYPE_Q4_0) ||
+                (tensor.type == GGML_TYPE_Q4_1 && quantized_type == GGML_TYPE_Q4_1)) {
+                new_type = tensor.type;
+                new_data = tensor.data;
+                new_size = tensor.size;
+                quantize_upgrade(new_type, new_data, new_size);
+                printf("upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+            } else {
+                throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
+            }
+        } else if (!quantize) {
             new_type = tensor.type;
             new_data = tensor.data;
             new_size = tensor.size;
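
A quick way to sanity-check the re-layout is to feed `quantize_upgrade` a single block with known nibble values and assert the interleaved result. The sketch below is not part of the patch; it assumes it is compiled in the same translation unit as ggml.c, since `block_q4_0` and `QK4_0` are internal to ggml.c and not exposed through ggml.h. The value pattern `x[k] = k % 13` is arbitrary, chosen so the two halves of the block hold different values.

```c
#include <assert.h>
#include <stdint.h>

static void test_quantize_upgrade_q4_0(void) {
    block_q4_0 blk;
    blk.d = 1.0f;

    // GGJT v1 layout: byte j packs x[2j] (low nibble) and x[2j+1] (high nibble)
    for (int j = 0; j < QK4_0/2; j++) {
        const uint8_t lo = (uint8_t)((2*j    ) % 13); // x[2j]
        const uint8_t hi = (uint8_t)((2*j + 1) % 13); // x[2j+1]
        blk.qs[j] = (uint8_t)(lo | (hi << 4));
    }

    quantize_upgrade(GGML_TYPE_Q4_0, &blk, sizeof(blk));

    // new layout: byte j packs x[j] (low nibble) and x[j + QK4_0/2] (high nibble)
    for (int j = 0; j < QK4_0/2; j++) {
        assert((blk.qs[j] & 0x0f) ==  j             % 13);
        assert((blk.qs[j] >> 4)   == (j + QK4_0/2)  % 13);
    }
}
```

The scale `d` is untouched by the upgrade, so only the `qs` bytes need checking; running the same pattern through the Q4_1 branch with a `block_q4_1` works identically.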