# include "llama-quant.h"
# include "llama-impl.h"
# include "llama-model.h"
# include "llama-model-loader.h"
# include <algorithm>
# include <cmath>
# include <cstring>
2025-01-08 15:19:36 +01:00
# include <cinttypes>
2025-01-03 09:18:53 +01:00
# include <fstream>
# include <mutex>
# include <thread>
# include <unordered_map>
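
// helper: write n zero bytes to the output stream
// (used to reserve space for the GGUF metadata and to pad tensor data to the alignment boundary)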
static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}
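
// bookkeeping shared across the quantization of a single model:
// per-tensor-class counters, imatrix availability and fallback statistics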
struct quantize_state_impl {
    const llama_model                 & model;
    const llama_model_quantize_params * params;

    int n_attention_wv = 0;
    int n_ffn_down     = 0;
    int n_ffn_gate     = 0;
    int n_ffn_up       = 0;
    int i_attention_wv = 0;
    int i_ffn_down     = 0;
    int i_ffn_gate     = 0;
    int i_ffn_up       = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;

    bool has_imatrix = false;

    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
        {}
};
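
// dequantize (or convert) a tensor to F32 into `output`, resizing it as needed;
// for nthread >= 2 the work is split across worker threads at block granularity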
static void llama_tensor_dequantize_impl(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
    if (ggml_is_quantized(tensor->type)) {
        if (qtype->to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16 &&
               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (tensor->type == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype->to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ABORT("fatal error"); // unreachable
        }
        return;
    }

    size_t block_size;
    if (tensor->type == GGML_TYPE_F16 ||
        tensor->type == GGML_TYPE_BF16) {
        block_size = 1;
    } else {
        block_size = (size_t)ggml_blck_size(tensor->type);
    }

    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else if (typ == GGML_TYPE_BF16) {
                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype->to_float(inbuf, outbuf, nels);
            }
        };
        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
    for (auto & w : workers) { w.join(); }
    workers.clear();
}
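
// given the requested default quantization type, pick a (possibly different) type for a specific tensor
// based on its name, shape, the model architecture and the target ftype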
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);

    // TODO: avoid hardcoded tensor names - use the TN_* constants
    const llm_arch arch = qs.model.arch;
    const auto       tn = LLM_TN(arch);
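
    // use_more_bits selects the layers that get a higher-precision type:
    // the first and last ~1/8 of the layers, plus every third layer in between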
    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    };
    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
        if (n_expert > 1) {
            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
            // for getting the current layer as I initially thought, and we need to resort to parsing the
            // tensor name.
            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
            }
            if (i_layer < 0 || i_layer >= n_layer) {
                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
            }
        }
        return std::make_pair(i_layer, n_layer);
    };

    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
    // with the quantization of the output tensor
    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
            new_type = qs.params->output_tensor_type;
        } else {
            const int64_t nx   = tensor->ne[0];
            const int64_t qk_k = ggml_blck_size(new_type);

            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M   ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q5_K;
            }
            else if (new_type != GGML_TYPE_Q8_0) {
                new_type = GGML_TYPE_Q6_K;
            }
        }
    } else if (name == "token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q2_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                new_type = GGML_TYPE_Q4_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            ++qs.i_attention_wv;
        }
        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (name.find("ffn_down") != std::string::npos) {
            if (qs.i_ffn_down < qs.n_ffn_down/8) {
                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            }
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
            if (qs.model.hparams.n_expert == 8) {
                new_type = GGML_TYPE_Q5_K;
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
            }
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        if (qs.model.type == MODEL_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        ++qs.i_attention_wv;
    } else if (name.find("attn_k.weight") != std::string::npos) {
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("attn_q.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_ffn_down;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S   ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }

    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
    //}
    bool convert_incompatible_tensor = false;
    {
        const int64_t nx = tensor->ne[0];
        const int64_t ny = tensor->ne[1];
        const int64_t qk_k = ggml_blck_size(new_type);

        if (nx % qk_k != 0) {
            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
        } else {
            ++qs.n_k_quantized;
        }
    }
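
    // the requested type cannot be used for this tensor shape - fall back to a legacy type with a
    // smaller block size (or, as a last resort, F16)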
    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ1_M:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
            new_type = GGML_TYPE_F16;
        }
        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
        ++qs.n_fallback;
    }

    return new_type;
}
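
// quantize the F32 data of one 2D slice of a tensor; with nthread >= 2 the rows are processed in
// chunks by worker threads and each chunk is validated after it is written; returns the output size in bytes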
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
    if (nthread < 2) {
        // single-thread
        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
            throw std::runtime_error("quantized data validation failed");
        }
        return new_size;
    }

    std::mutex mutex;
    int64_t counter = 0;
    size_t new_size = 0;
    bool valid = true;
    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
            nrows, n_per_row, imatrix]() {
        const int64_t nrows_per_chunk = chunk_size / n_per_row;
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int64_t first_row = counter; counter += nrows_per_chunk;
            if (first_row >= nrows) {
                if (local_size > 0) {
                    new_size += local_size;
                }
                break;
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
            local_size += this_size;

            // validate the quantized data
            const size_t row_size = ggml_row_size(new_type, n_per_row);
            void * this_data = (char *) new_data + first_row * row_size;
            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
                std::unique_lock<std::mutex> lock(mutex);
                valid = false;
                break;
            }
        }
    };
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute();
    for (auto & w : workers) { w.join(); }
    workers.clear();
    if (!valid) {
        throw std::runtime_error("quantized data validation failed");
    }
    return new_size;
}
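
// main quantization entry point: reads the input GGUF, decides a quantization type per tensor,
// quantizes (or copies) the data and writes the output file(s)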
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type default_type;
    llama_ftype ftype = params->ftype;

    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

    int nthread = params->nthread;

    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
    }

    // mmap consistently increases speed on Linux, and also increases speed on Windows with
    // a hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
#else
    constexpr bool use_mmap = false;
#endif

    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }

    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
    ml.init_mappings(false); // no prefetching

    llama_model model;
    llm_load_arch(ml, model);
    llm_load_hparams(ml, model);
    llm_load_stats(ml, model);

    struct quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = model.ftype;
    }
    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
    if (params->imatrix) {
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n", int(imatrix_data->size()));
            qs.has_imatrix = true;

            // check imatrix for nans or infs
            for (const auto & kv : *imatrix_data) {
                for (float f : kv.second) {
                    if (!std::isfinite(f)) {
                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                    }
                }
            }
        }
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.meta.get());
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());

    if (params->kv_overrides) {
        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
        for (const auto & o : overrides) {
            if (o.key[0] == 0) break;
            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
                gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
            } else {
                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
            }
        }
    }

    // make a list of weights
    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
    tensors.reserve(ml.weights_map.size());
    for (const auto & it : ml.weights_map) {
        tensors.push_back(&it.second);
    }

    // keep_split requires that the weights are sorted by split index
    if (params->keep_split) {
        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
            if (a->idx == b->idx) {
                return a->offs < b->offs;
            }
            return a->idx < b->idx;
        });
    }

    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;

        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight")    != std::string::npos ||
            name.find("attn_qkv.weight")  != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }
    }

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

    // sanity checks
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
        if (llama_model_has_encoder(&model)) {
            n_attn_layer *= 3;
        }
        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
    }

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<std::thread> workers;
    workers.reserve(nthread);

    int idx = 0;

    std::vector<no_init<uint8_t>> read_data;
    std::vector<no_init<uint8_t>> work;
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;

    // Assume split index is continuous
    if (params->keep_split) {
        for (const auto * it : tensors) {
            n_split = std::max(uint16_t(it->idx + 1), n_split);
        }
    }
    std::vector<gguf_context_ptr> ctx_outs(n_split);
    ctx_outs[0] = std::move(ctx_out);

    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
        struct ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
    }

    // Set split info if needed
    if (n_split > 1) {
        for (size_t i = 0; i < ctx_outs.size(); ++i) {
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
        }
    }
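
    // the output is written file-by-file: close_ofstream() fills in the (now complete) metadata of the
    // current split before closing it, and new_ofstream() opens the next split with a zero-filled
    // placeholder where the metadata will later be written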
    int cur_split = -1;
    std::ofstream fout;
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
            fout.seekp(0);
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
            fout.write((const char *) data.data(), data.size());
            fout.close();
        }
    };
    auto new_ofstream = [&](int index) {
        cur_split = index;
        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
        std::string fname = fname_out;
        if (params->keep_split) {
            std::vector<char> split_path(llama_path_max(), 0);
            llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
            fname = std::string(split_path.data());
        }

        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
        // placeholder for the meta data
        ::zeros(fout, meta_size);
    };
    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
        struct ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
            if (read_data.size() < ggml_nbytes(tensor)) {
                read_data.resize(ggml_nbytes(tensor));
            }
            tensor->data = read_data.data();
        }
        ml.load_data_for(tensor);

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
               ++idx, ml.n_tensors,
               ggml_get_name(tensor),
               llama_format_tensor_shape(tensor).c_str(),
               ggml_type_name(tensor->type));

        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);

        // do not quantize norm tensors
        quantize &= name.find("_norm.weight") == std::string::npos;

        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;

        // do not quantize RWKV's time_mix_first tensors
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

        enum ggml_type new_type;
        void * new_data;
        size_t new_size;

        if (quantize) {
            new_type = default_type;

            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                new_type = params->output_tensor_type;
            }

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
            quantize = tensor->type != new_type;
        }

        if (!quantize) {
            new_type = tensor->type;
            new_data = tensor->data;
            new_size = ggml_nbytes(tensor);
            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
        } else {
            const int64_t nelements = ggml_nelements(tensor);

            const float * imatrix = nullptr;
            if (imatrix_data) {
                auto it = imatrix_data->find(tensor->name);
                if (it == imatrix_data->end()) {
                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                } else {
                    if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                        imatrix = it->second.data();
                    } else {
                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
                        // this can happen when quantizing an old mixtral model with split tensors using a new, incompatible imatrix
                        // this is a significant error and it may be a good idea to abort the process if this happens,
                        // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
                        // tok_embd should be ignored in this case, since it always causes this warning
                        if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
                        }
                    }
                }
            }
            if ((new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS  ||
                 new_type == GGML_TYPE_IQ2_S   ||
                 new_type == GGML_TYPE_IQ1_S   ||
                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
                LLAMA_LOG_ERROR("============================================================\n\n");
                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
            }

            float * f32_data;

            if (tensor->type == GGML_TYPE_F32) {
                f32_data = (float *) tensor->data;
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

            if (work.size() < (size_t)nelements * 4) {
                work.resize(nelements * 4); // upper bound on size
            }
            new_data = work.data();
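
            // the rows of each 2D slice are grouped into chunks of at least min_chunk_size elements
            // so that the work can be spread across the available threads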
            const int64_t n_per_row = tensor->ne[0];
            const int64_t nrows = tensor->ne[1];

            static const int64_t min_chunk_size = 32 * 512;
            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

            // quantize each expert separately since they have different importance matrices
            new_size = 0;
            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
        }
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

        // update the gguf meta data as we go
        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
    close_ofstream();

    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

    if (qs.n_fallback > 0) {
        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
    }
}
//
// interface implementation
//
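
// default quantization parameters: Q5_1 target ftype, quantize the output tensor, no per-tensor type overrides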
struct llama_model_quantize_params llama_model_quantize_default_params() {
    struct llama_model_quantize_params result = {
        /*.nthread                 =*/ 0,
        /*.ftype                   =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type      =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type    =*/ GGML_TYPE_COUNT,
        /*.allow_requantize        =*/ false,
        /*.quantize_output_tensor  =*/ true,
        /*.only_copy               =*/ false,
        /*.pure                    =*/ false,
        /*.keep_split              =*/ false,
        /*.imatrix                 =*/ nullptr,
        /*.kv_overrides            =*/ nullptr,
    };

    return result;
}
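
// Example usage (sketch, file names are placeholders): quantize an F16 GGUF to Q4_K_M on 8 threads.
//
//     llama_model_quantize_params params = llama_model_quantize_default_params();
//     params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
//     params.nthread = 8;
//     if (llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &params) != 0) {
//         // quantization failed - an error has already been logged
//     }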
uint32_t llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_impl(fname_inp, fname_out, params);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }

    return 0;
}