2023-05-01 18:23:47 +02:00
# include "build-info.h"
2023-03-10 19:40:58 +01:00
2023-05-20 10:06:11 +02:00
# include "llama.h"
2023-03-10 19:40:58 +01:00
# include <cstdio>
2023-06-10 09:59:17 +02:00
# include <cstring>
2023-04-26 18:43:27 +02:00
# include <map>
2023-03-10 19:40:58 +01:00
# include <string>
2023-03-21 18:21:50 +01:00
2023-05-05 00:58:56 +02:00
static const std : : map < std : : string , llama_ftype > LLAMA_FTYPE_MAP = {
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
{ " q4_0 " , LLAMA_FTYPE_MOSTLY_Q4_0 } ,
{ " q4_1 " , LLAMA_FTYPE_MOSTLY_Q4_1 } ,
{ " q5_0 " , LLAMA_FTYPE_MOSTLY_Q5_0 } ,
{ " q5_1 " , LLAMA_FTYPE_MOSTLY_Q5_1 } ,
{ " q8_0 " , LLAMA_FTYPE_MOSTLY_Q8_0 } ,
{ " q2_K " , LLAMA_FTYPE_MOSTLY_Q2_K } ,
{ " q3_K " , LLAMA_FTYPE_MOSTLY_Q3_K_M } ,
{ " q3_K_S " , LLAMA_FTYPE_MOSTLY_Q3_K_S } ,
{ " q3_K_M " , LLAMA_FTYPE_MOSTLY_Q3_K_M } ,
{ " q3_K_L " , LLAMA_FTYPE_MOSTLY_Q3_K_L } ,
{ " q4_K " , LLAMA_FTYPE_MOSTLY_Q4_K_M } ,
{ " q4_K_S " , LLAMA_FTYPE_MOSTLY_Q4_K_S } ,
{ " q4_K_M " , LLAMA_FTYPE_MOSTLY_Q4_K_M } ,
{ " q5_K " , LLAMA_FTYPE_MOSTLY_Q5_K_M } ,
{ " q5_K_S " , LLAMA_FTYPE_MOSTLY_Q5_K_S } ,
{ " q5_K_M " , LLAMA_FTYPE_MOSTLY_Q5_K_M } ,
{ " q6_K " , LLAMA_FTYPE_MOSTLY_Q6_K } ,
2023-04-26 18:43:27 +02:00
} ;
2023-05-05 00:58:56 +02:00
bool try_parse_ftype ( const std : : string & ftype_str , llama_ftype & ftype , std : : string & ftype_str_out ) {
auto it = LLAMA_FTYPE_MAP . find ( ftype_str ) ;
if ( it ! = LLAMA_FTYPE_MAP . end ( ) ) {
ftype = it - > second ;
ftype_str_out = it - > first ;
return true ;
}
// try to parse as an integer
try {
int ftype_int = std : : stoi ( ftype_str ) ;
for ( auto it = LLAMA_FTYPE_MAP . begin ( ) ; it ! = LLAMA_FTYPE_MAP . end ( ) ; it + + ) {
if ( it - > second = = ftype_int ) {
ftype = it - > second ;
ftype_str_out = it - > first ;
return true ;
}
}
}
catch ( . . . ) {
// stoi failed
}
return false ;
}
2023-03-10 19:40:58 +01:00
// usage:
2023-05-05 00:58:56 +02:00
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
2023-03-10 19:40:58 +01:00
//
2023-06-10 09:59:17 +02:00
void usage ( const char * executable ) {
fprintf ( stderr , " usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads] \n " , executable ) ;
fprintf ( stderr , " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit \n " ) ;
fprintf ( stderr , " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing \n " ) ;
fprintf ( stderr , " Allowed quantization types: \n " ) ;
for ( auto it = LLAMA_FTYPE_MAP . begin ( ) ; it ! = LLAMA_FTYPE_MAP . end ( ) ; it + + ) {
fprintf ( stderr , " type = \" %s \" or %d \n " , it - > first . c_str ( ) , it - > second ) ;
}
exit ( 1 ) ;
}
2023-03-10 19:40:58 +01:00
int main ( int argc , char * * argv ) {
2023-05-05 00:58:56 +02:00
if ( argc < 3 ) {
2023-06-10 09:59:17 +02:00
usage ( argv [ 0 ] ) ;
}
llama_model_quantize_params params = llama_model_quantize_default_params ( ) ;
int arg_idx = 1 ;
for ( ; arg_idx < argc & & strncmp ( argv [ arg_idx ] , " -- " , 2 ) = = 0 ; arg_idx + + ) {
if ( strcmp ( argv [ arg_idx ] , " --leave-output-tensor " ) = = 0 ) {
params . quantize_output_tensor = false ;
} else if ( strcmp ( argv [ arg_idx ] , " --allow-requantize " ) = = 0 ) {
params . allow_requantize = true ;
} else {
usage ( argv [ 0 ] ) ;
2023-04-26 18:43:27 +02:00
}
2023-06-10 09:59:17 +02:00
}
if ( argc - arg_idx < 3 ) {
usage ( argv [ 0 ] ) ;
2023-03-10 19:40:58 +01:00
}
2023-05-20 10:06:11 +02:00
llama_init_backend ( ) ;
2023-03-11 16:40:14 +01:00
2023-05-05 00:58:56 +02:00
// parse command line arguments
2023-06-10 09:59:17 +02:00
const std : : string fname_inp = argv [ arg_idx ] ;
arg_idx + + ;
2023-05-05 00:58:56 +02:00
std : : string fname_out ;
std : : string ftype_str ;
2023-06-10 09:59:17 +02:00
if ( try_parse_ftype ( argv [ arg_idx ] , params . ftype , ftype_str ) ) {
2023-05-05 00:58:56 +02:00
std : : string fpath ;
const size_t pos = fname_inp . find_last_of ( ' / ' ) ;
if ( pos ! = std : : string : : npos ) {
fpath = fname_inp . substr ( 0 , pos + 1 ) ;
}
// export as [inp path]/ggml-model-[ftype].bin
fname_out = fpath + " ggml-model- " + ftype_str + " .bin " ;
arg_idx + + ;
}
else {
fname_out = argv [ arg_idx ] ;
arg_idx + + ;
2023-03-10 19:40:58 +01:00
2023-05-05 00:58:56 +02:00
if ( argc < = arg_idx ) {
fprintf ( stderr , " %s: missing ftype \n " , __func__ ) ;
return 1 ;
}
2023-06-10 09:59:17 +02:00
if ( ! try_parse_ftype ( argv [ arg_idx ] , params . ftype , ftype_str ) ) {
2023-05-05 00:58:56 +02:00
fprintf ( stderr , " %s: invalid ftype '%s' \n " , __func__ , argv [ 3 ] ) ;
return 1 ;
}
arg_idx + + ;
}
// parse nthreads
if ( argc > arg_idx ) {
try {
2023-06-10 09:59:17 +02:00
params . nthread = std : : stoi ( argv [ arg_idx ] ) ;
2023-05-05 00:58:56 +02:00
}
catch ( const std : : exception & e ) {
fprintf ( stderr , " %s: invalid nthread '%s' (%s) \n " , __func__ , argv [ arg_idx ] , e . what ( ) ) ;
2023-04-26 18:43:27 +02:00
return 1 ;
}
}
2023-05-01 18:23:47 +02:00
fprintf ( stderr , " %s: build = %d (%s) \n " , __func__ , BUILD_NUMBER , BUILD_COMMIT ) ;
2023-05-05 00:58:56 +02:00
fprintf ( stderr , " %s: quantizing '%s' to '%s' as %s " , __func__ , fname_inp . c_str ( ) , fname_out . c_str ( ) , ftype_str . c_str ( ) ) ;
2023-06-10 09:59:17 +02:00
if ( params . nthread > 0 ) {
fprintf ( stderr , " using %d threads " , params . nthread ) ;
2023-05-05 00:58:56 +02:00
}
fprintf ( stderr , " \n " ) ;
2023-03-10 19:40:58 +01:00
2023-05-20 10:06:11 +02:00
const int64_t t_main_start_us = llama_time_us ( ) ;
2023-03-10 19:40:58 +01:00
int64_t t_quantize_us = 0 ;
// load the model
{
2023-05-20 10:06:11 +02:00
const int64_t t_start_us = llama_time_us ( ) ;
2023-03-10 19:40:58 +01:00
2023-06-10 09:59:17 +02:00
if ( llama_model_quantize ( fname_inp . c_str ( ) , fname_out . c_str ( ) , & params ) ) {
2023-03-10 19:40:58 +01:00
fprintf ( stderr , " %s: failed to quantize model from '%s' \n " , __func__ , fname_inp . c_str ( ) ) ;
return 1 ;
}
2023-05-20 10:06:11 +02:00
t_quantize_us = llama_time_us ( ) - t_start_us ;
2023-03-10 19:40:58 +01:00
}
// report timing
{
2023-05-20 10:06:11 +02:00
const int64_t t_main_end_us = llama_time_us ( ) ;
2023-03-10 19:40:58 +01:00
printf ( " \n " ) ;
2023-03-28 18:48:20 +02:00
printf ( " %s: quantize time = %8.2f ms \n " , __func__ , t_quantize_us / 1000.0 ) ;
printf ( " %s: total time = %8.2f ms \n " , __func__ , ( t_main_end_us - t_main_start_us ) / 1000.0 ) ;
2023-03-10 19:40:58 +01:00
}
return 0 ;
}