mirror of https://github.com/ggerganov/llama.cpp.git
synced 2025-02-07 08:53:16 +01:00
Support more data types
parent d521d09380
commit 10cbc311e3
ggml.c — 46 changed lines
@@ -822,7 +822,7 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
         block_q4_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++) {
+            for (int j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
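The loop body below these context lines repacks the 4-bit values: in the old layout, byte `qs[k]` holds elements 2k and 2k+1; in the new layout it holds elements k and k + qk/2, matching the `// old:` / `// new:` comments. A minimal standalone sketch of that permutation — the `remap_qs` helper name and the toy qk = 8 block are illustrative only; the real Q4_0 block uses qk = QK4_0 = 32:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative helper mirroring the loop in quantize_upgrade().
 * old layout: qs[k] = element 2k (low nibble) | element 2k+1 (high nibble)
 * new layout: qs[k] = element k  (low nibble) | element k+qk/2 (high nibble) */
static void remap_qs(uint8_t *qs, int qk) {
    uint8_t tmp[16];                      /* qk/2 bytes; 16 covers qk = 32 */
    for (int j = 0; j < qk/4; j++) {
        const uint8_t d1 = qs[0 + j];
        const uint8_t d2 = qs[qk/4 + j];
        tmp[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
        tmp[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
    }
    memcpy(qs, tmp, qk/2);
}

int main(void) {
    const int qk = 8;
    /* elements 0..7 packed pair-adjacent: qs[k] = (2k) | ((2k+1) << 4) */
    uint8_t qs[4] = { 0x10, 0x32, 0x54, 0x76 };
    remap_qs(qs, qk);
    for (int k = 0; k < qk/2; k++) {
        printf("qs[%d] = 0x%02x\n", k, qs[k]);   /* 0x40 0x51 0x62 0x73 */
    }
    return 0;
}

Running it prints 0x40 0x51 0x62 0x73: element k now shares a byte with element k + qk/2, exactly the "d0, d_half, d1, d_half1" ordering the comment describes.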
@@ -843,7 +843,49 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
         block_q4_1 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q5_0) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q5_0);
+        block_q5_0 *blk = (block_q5_0 *)data;
+        block_q5_0 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q5_1) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q5_1);
+        block_q5_1 *blk = (block_q5_1 *)data;
+        block_q5_1 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (int j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
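The Q4_1, Q5_0, and Q5_1 branches added here repeat the Q4_0 inner loop verbatim; only the block struct changes, and only the `qs` nibble array is rewritten (the separate high-bit field that ggml's Q5 blocks carry is left untouched by this commit). Since the same shuffle now serves four block types, a quick property test can confirm it is a pure permutation: every element readable under the old pair-adjacent indexing must reappear at its new split-layout position. A self-contained harness, repeating the illustrative `remap_qs` from the sketch above (an assumption, not code from this commit):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same illustrative shuffle as the earlier sketch. */
static void remap_qs(uint8_t *qs, int qk) {
    uint8_t tmp[16];                      /* qk/2 bytes; 16 covers qk = 32 */
    for (int j = 0; j < qk/4; j++) {
        const uint8_t d1 = qs[0 + j];
        const uint8_t d2 = qs[qk/4 + j];
        tmp[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
        tmp[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
    }
    memcpy(qs, tmp, qk/2);
}

/* Old layout: element e lives in byte e/2, low nibble when e is even. */
static int get_old(const uint8_t *qs, int e) {
    return (e % 2 == 0) ? (qs[e/2] & 0x0f) : (qs[e/2] >> 4);
}

/* New layout: low nibbles hold elements 0..qk/2-1, high nibbles the rest. */
static int get_new(const uint8_t *qs, int qk, int e) {
    return (e < qk/2) ? (qs[e] & 0x0f) : (qs[e - qk/2] >> 4);
}

int main(void) {
    const int qk = 32;                    /* QK4_1 = QK5_0 = QK5_1 = 32 */
    uint8_t qs[16];
    int before[32];
    for (int k = 0; k < qk/2; k++) qs[k] = (uint8_t)(31 * k + 7);  /* arbitrary */
    for (int e = 0; e < qk; e++) before[e] = get_old(qs, e);

    remap_qs(qs, qk);

    for (int e = 0; e < qk; e++) assert(get_new(qs, qk, e) == before[e]);
    printf("all %d elements land at their new positions\n", qk);
    return 0;
}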
llama.cpp — 15 changed lines
@@ -2075,22 +2075,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         llama_buffer work;
 
         if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
-            if (tensor.type == GGML_TYPE_Q4_0 && quantized_type == GGML_TYPE_Q4_0) {
+            if (((tensor.type == GGML_TYPE_Q4_0)
+                || (tensor.type == GGML_TYPE_Q4_1)
+                || (tensor.type == GGML_TYPE_Q5_0)
+                || (tensor.type == GGML_TYPE_Q5_1)) && (quantized_type == tensor.type)) {
                 // convert
                 new_type = tensor.type;
                 new_data = tensor.data;
                 new_size = tensor.size;
                 quantize_upgrade(new_type, new_data, new_size);
                 printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            }
-            else if (tensor.type == GGML_TYPE_Q4_1 && quantized_type == GGML_TYPE_Q4_1) {
-                new_type = tensor.type;
-                new_data = tensor.data;
-                new_size = tensor.size;
-                quantize_upgrade(new_type, new_data, new_size);
-                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            }
-            else {
+            } else {
                 throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
             }
         } else if (!quantize) {
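The rewritten condition folds the duplicated per-type branches into one membership test plus the requirement that the requested output type equals the tensor's existing type: quantize_upgrade reshuffles bytes in place rather than requantizing, so same-type is the only case it can handle. A hypothetical predicate capturing the same check (not code from this commit; assumes enum ggml_type from ggml.h):

#include <stdbool.h>
#include "ggml.h"

// Hypothetical helper equivalent to the collapsed condition in the hunk above.
static bool is_format_upgrade(enum ggml_type tensor_type, enum ggml_type quantized_type) {
    switch (tensor_type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
            // in-place upgrade: output type must match the stored type
            return quantized_type == tensor_type;
        default:
            return false;
    }
}

With the condition collapsed, the branch body reduces to the single quantize_upgrade call plus the size printout, and every unsupported combination falls through to the throw.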