mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 14:20:31 +01:00
quantize: be able to specify the token embedding tensor type
This commit is contained in:
parent
7883796f71
commit
0e826d12a5
@ -221,6 +221,12 @@ int main(int argc, char ** argv) {
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||
|
@ -11987,8 +11987,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
}
|
||||
}
|
||||
} else if (name == "token_embd.weight") {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
||||
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->token_embedding_type;
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
@ -11997,6 +11999,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
if (name.find("attn_v.weight") != std::string::npos) {
|
||||
@ -12892,6 +12895,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
/*.nthread =*/ 0,
|
||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
||||
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
||||
/*.allow_requantize =*/ false,
|
||||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
|
1
llama.h
1
llama.h
@ -278,6 +278,7 @@ extern "C" {
|
||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
enum ggml_type output_tensor_type; // output tensor type
|
||||
enum ggml_type token_embedding_type; // itoken embeddings tensor type
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
|
Loading…
Reference in New Issue
Block a user