mirror of https://github.com/ggerganov/llama.cpp.git
synced 2025-01-28 04:47:04 +01:00
quantize: be able to specify the output tensor type
This commit is contained in:
parent b2075fd6a5
commit 7883796f71

@@ -189,6 +189,18 @@ static void prepare_imatrix(const std::string& imatrix_file,
     }
 }
 
+static ggml_type parse_ggml_type(const char * arg) {
+    ggml_type result = GGML_TYPE_COUNT;
+    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
+        auto type = ggml_type(j);
+        const auto * name = ggml_type_name(type);
+        if (name && strcmp(arg, name) == 0) {
+            result = type; break;
+        }
+    }
+    return result;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
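
The helper resolves a type name exactly as reported by ggml_type_name() and uses GGML_TYPE_COUNT as a "not specified" sentinel, which the llama.cpp change below tests with < GGML_TYPE_COUNT. A minimal standalone sketch of the same lookup, assuming ggml.h is on the include path (the function name here is illustrative, not part of the commit):

    // sketch: map a ggml type name (e.g. "q8_0") to its enum value,
    // mirroring the parse_ggml_type helper added above
    #include <cstdio>
    #include <cstring>
    #include "ggml.h"

    static ggml_type parse_type_name(const char * arg) {
        for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
            const ggml_type type = ggml_type(j);
            const char * name = ggml_type_name(type);
            if (name && strcmp(arg, name) == 0) {
                return type;
            }
        }
        return GGML_TYPE_COUNT; // sentinel: unknown name / not specified
    }

    int main() {
        const ggml_type t = parse_type_name("q8_0");
        printf("%s\n", t < GGML_TYPE_COUNT ? ggml_type_name(t) : "unknown type");
        return 0;
    }
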
@@ -203,6 +215,12 @@ int main(int argc, char ** argv) {
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
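
Given this argument loop, the new flag must appear before the positional arguments, and its value has to be a name that ggml_type_name() reports (for example q8_0, q6_K, or f16); an unrecognized name is stored as GGML_TYPE_COUNT and simply leaves the default heuristic in effect. A hypothetical invocation, with the binary and model paths as placeholders:

    ./quantize --output-tensor-type q8_0 ggml-model-f16.gguf ggml-model-Q4_K_M.gguf Q4_K_M
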
@@ -11971,6 +11971,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -11982,6 +11985,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
+        }
     } else if (name == "token_embd.weight") {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
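
The net effect: an explicit --output-tensor-type overrides the built-in heuristic for the output tensor (and for token_embd.weight when it doubles as the output), while the GGML_TYPE_COUNT default leaves the existing Q8_0/Q6_K fallback untouched. A condensed sketch of that precedence, simplified and not the literal code:

    #include <cstdio>
    #include "ggml.h"

    // simplified view of the selection added above:
    // an explicit user choice wins, otherwise the old heuristic applies
    static ggml_type pick_output_type(ggml_type requested, ggml_type heuristic) {
        if (requested < GGML_TYPE_COUNT) {
            return requested;   // set via --output-tensor-type
        }
        return heuristic;       // e.g. the Q8_0/Q6_K choice made by the existing code
    }

    int main() {
        // GGML_TYPE_COUNT = "flag not given" -> heuristic result is kept
        printf("%s\n", ggml_type_name(pick_output_type(GGML_TYPE_COUNT, GGML_TYPE_Q6_K)));
        // explicit request -> overrides the heuristic
        printf("%s\n", ggml_type_name(pick_output_type(GGML_TYPE_Q8_0, GGML_TYPE_Q6_K)));
        return 0;
    }
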
@@ -12887,6 +12891,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,

llama.h
@@ -277,6 +277,7 @@ extern "C" {
     typedef struct llama_model_quantize_params {
         int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype; // quantize to this llama_ftype
+        enum ggml_type output_tensor_type; // output tensor type
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
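
For programmatic use, the new field is set through the existing llama.h entry points. A minimal sketch, with the model file names as placeholders and backend initialization and error handling omitted:

    // sketch: quantize a model while forcing the output tensor to Q8_0
    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype              = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type = GGML_TYPE_Q8_0; // same effect as --output-tensor-type q8_0

        // llama_model_quantize returns 0 on success
        return (int) llama_model_quantize("ggml-model-f16.gguf", "ggml-model-Q4_K_M.gguf", &params);
    }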