From 10cbc311e3bc9e9fc80858808439f842eec50a27 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Thu, 18 May 2023 09:49:25 +0800
Subject: [PATCH] Support more data types

---
 ggml.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 llama.cpp | 15 +++++----------
 2 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1d8e5b6a9..700c36f24 100644
--- a/ggml.c
+++ b/ggml.c
@@ -822,7 +822,7 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
         block_q4_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++) {
+            for (int j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
@@ -843,7 +843,49 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
         block_q4_1 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q5_0) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q5_0);
+        block_q5_0 *blk = (block_q5_0 *)data;
+        block_q5_0 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q5_1) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q5_1);
+        block_q5_1 *blk = (block_q5_1 *)data;
+        block_q5_1 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (int j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
diff --git a/llama.cpp b/llama.cpp
index fb231d56d..f1a1acbf1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2075,22 +2075,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         llama_buffer work;
 
         if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
-            if (tensor.type == GGML_TYPE_Q4_0 && quantized_type == GGML_TYPE_Q4_0) {
+            if (((tensor.type == GGML_TYPE_Q4_0)
+                || (tensor.type == GGML_TYPE_Q4_1)
+                || (tensor.type == GGML_TYPE_Q5_0)
+                || (tensor.type == GGML_TYPE_Q5_1)) && (quantized_type == tensor.type)) {
                 // convert
                 new_type = tensor.type;
                 new_data = tensor.data;
                 new_size = tensor.size;
                 quantize_upgrade(new_type, new_data, new_size);
                 printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            }
-            else if (tensor.type == GGML_TYPE_Q4_1 && quantized_type == GGML_TYPE_Q4_1) {
-                new_type = tensor.type;
-                new_data = tensor.data;
-                new_size = tensor.size;
-                quantize_upgrade(new_type, new_data, new_size);
-                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            }
-            else {
+            } else {
                 throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
             }
         } else if (!quantize) {
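
For reference, here is a minimal standalone sketch (not part of the patch) of the
nibble re-interleaving that quantize_upgrade performs. Per the patch's own
comments, in the old GGJT v1 layout byte j of a block packs elements 2j (low
nibble) and 2j+1 (high nibble); in the new v2 layout byte j packs elements j and
j + qk/2, which is what "d0, d_half, d1, d_half1" describes. The harness assumes
qk = 32 (the ggml Q4/Q5 block size); the main function and mod-16 fill values
are illustrative only.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    enum { QK = 32 };          // ggml block size for Q4_0/Q4_1/Q5_0/Q5_1
    uint8_t qs[QK / 2];        // packed 4-bit quants, two per byte
    uint8_t out[QK / 2];

    // Fill in the old (v1) layout: byte j holds element 2j in the low
    // nibble and element 2j+1 in the high nibble (values taken mod 16).
    for (int j = 0; j < QK / 2; j++) {
        qs[j] = (uint8_t)(((2 * j) & 0x0f) | (((2 * j + 1) & 0x0f) << 4));
    }

    // The same per-block loop body the patch uses.
    for (int j = 0; j < QK / 4; j++) {
        uint8_t d1 = qs[0 + j];
        uint8_t d2 = qs[QK / 4 + j];
        out[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
        out[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
    }
    memcpy(qs, out, sizeof(out));

    // Verify the new (v2) layout: byte k now holds element k in the low
    // nibble and element k + QK/2 in the high nibble.
    for (int k = 0; k < QK / 2; k++) {
        assert((qs[k] & 0x0f) == (k & 0x0f));
        assert((qs[k] >> 4) == ((k + QK / 2) & 0x0f));
    }
    return 0;
}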