2023-04-22 11:10:39 +02:00
|
|
|
// Unit tests for quantization specific functions - quantize, dequantize and dot product
|
|
|
|
|
|
|
|
#include "ggml.h"
|
|
|
|
|
|
|
|
#undef NDEBUG
|
|
|
|
#include <assert.h>
|
|
|
|
#include <math.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
2023-06-16 21:23:53 +03:00
|
|
|
#if defined(_MSC_VER)
|
|
|
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
|
|
#endif
|
|
|
|
|
2023-09-15 15:38:27 -04:00
|
|
|
constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
|
|
|
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-05 21:48:47 -04:00
|
|
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
|
2023-09-15 15:38:27 -04:00
|
|
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
|
|
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
2024-01-30 15:14:12 +02:00
|
|
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
|
2023-09-15 15:38:27 -04:00
|
|
|
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
2024-01-30 15:14:12 +02:00
|
|
|
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-05 21:48:47 -04:00
|
|
|
constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
|
2023-04-22 11:10:39 +02:00
|
|
|
|
2023-09-15 15:38:27 -04:00
|
|
|
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
2023-04-22 11:10:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
// Generate synthetic data
|
2023-09-15 15:38:27 -04:00
|
|
|
static void generate_data(float offset, size_t n, float * dst) {
|
2023-04-22 11:10:39 +02:00
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
dst[i] = 0.1 + 2*cosf(i + offset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Calculate RMSE between two float arrays
|
2023-09-15 15:38:27 -04:00
|
|
|
static float array_rmse(const float * a1, const float * a2, size_t n) {
|
2023-04-22 11:10:39 +02:00
|
|
|
double sum = 0;
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
double diff = a1[i] - a2[i];
|
|
|
|
sum += diff * diff;
|
|
|
|
}
|
|
|
|
return sqrtf(sum) / n;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Total quantization error on test data
|
2023-09-15 15:38:27 -04:00
|
|
|
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
2023-04-25 23:40:51 +03:00
|
|
|
std::vector<uint8_t> tmp_q(2*test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
std::vector<float> tmp_out(test_size);
|
|
|
|
|
2023-07-05 16:13:06 +00:00
|
|
|
qfns.from_float(test_data, tmp_q.data(), test_size);
|
|
|
|
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
return array_rmse(test_data, tmp_out.data(), test_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Total quantization error on test data
|
2023-09-15 15:38:27 -04:00
|
|
|
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
2023-04-25 23:40:51 +03:00
|
|
|
std::vector<uint8_t> tmp_q(2*test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
std::vector<float> tmp_out(test_size);
|
|
|
|
std::vector<float> tmp_out_ref(test_size);
|
|
|
|
|
2023-07-05 16:13:06 +00:00
|
|
|
qfns.from_float(test_data, tmp_q.data(), test_size);
|
|
|
|
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
|
2024-07-12 10:46:02 +03:00
|
|
|
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
|
2023-07-05 16:13:06 +00:00
|
|
|
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
|
|
|
|
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
|
|
|
}
|
|
|
|
|
2023-09-15 15:38:27 -04:00
|
|
|
static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
2023-04-22 11:10:39 +02:00
|
|
|
double sum = 0;
|
|
|
|
for (size_t i = 0; i < test_size; i++) {
|
|
|
|
sum += a1[i] * a2[i];
|
|
|
|
}
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Total dot product error
|
2023-09-15 15:38:27 -04:00
|
|
|
static float dot_product_error(
|
|
|
|
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
|
|
|
|
) {
|
2023-04-25 23:40:51 +03:00
|
|
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
|
|
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
|
2023-07-05 16:13:06 +00:00
|
|
|
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
|
|
|
|
|
|
|
qfns.from_float(test_data1, tmp_q1.data(), test_size);
|
|
|
|
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
2023-04-22 11:10:39 +02:00
|
|
|
|
|
|
|
float result = INFINITY;
|
2024-02-11 07:22:33 -06:00
|
|
|
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
2023-04-22 11:10:39 +02:00
|
|
|
|
|
|
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
|
|
|
|
|
|
|
return fabsf(result - dot_ref) / test_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
int main(int argc, char * argv[]) {
|
|
|
|
bool verbose = false;
|
|
|
|
const size_t test_size = 32 * 128;
|
|
|
|
|
|
|
|
std::string arg;
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
|
|
arg = argv[i];
|
|
|
|
|
|
|
|
if (arg == "-v") {
|
|
|
|
verbose = true;
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<float> test_data(test_size);
|
|
|
|
std::vector<float> test_data2(test_size);
|
|
|
|
|
|
|
|
generate_data(0.0, test_data.size(), test_data.data());
|
|
|
|
generate_data(1.0, test_data2.size(), test_data2.data());
|
|
|
|
|
|
|
|
// Initialize GGML, ensures float conversion tables are initialized
|
|
|
|
struct ggml_init_params ggml_params = {
|
|
|
|
/* .mem_size = */ 1*1024,
|
|
|
|
/* .mem_buffer = */ NULL,
|
|
|
|
/* .no_alloc = */ true,
|
|
|
|
};
|
|
|
|
struct ggml_context * ctx = ggml_init(ggml_params);
|
|
|
|
|
|
|
|
int num_failed = 0;
|
|
|
|
bool failed = false;
|
|
|
|
|
|
|
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
|
|
ggml_type type = (ggml_type) i;
|
2023-07-05 16:13:06 +00:00
|
|
|
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
2023-04-22 11:10:39 +02:00
|
|
|
|
2023-10-30 19:19:15 +02:00
|
|
|
// deprecated - skip
|
|
|
|
if (qfns.blck_size == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2024-01-11 20:39:39 +01:00
|
|
|
const ggml_type ei = (ggml_type)i;
|
2024-01-30 15:14:12 +02:00
|
|
|
|
2023-10-30 19:19:15 +02:00
|
|
|
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
2024-01-30 15:14:12 +02:00
|
|
|
ggml_quantize_init(ei);
|
2023-10-30 19:19:15 +02:00
|
|
|
|
2023-07-05 16:13:06 +00:00
|
|
|
if (qfns.from_float && qfns.to_float) {
|
2023-04-22 11:10:39 +02:00
|
|
|
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 22:56:18 +03:00
|
|
|
const float max_quantization_error =
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-05 21:48:47 -04:00
|
|
|
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
|
|
|
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
2024-01-30 15:14:12 +02:00
|
|
|
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
2024-02-26 18:28:38 +02:00
|
|
|
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
2024-01-30 15:14:12 +02:00
|
|
|
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
2024-02-24 16:23:52 +02:00
|
|
|
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
2024-01-30 15:14:12 +02:00
|
|
|
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 22:56:18 +03:00
|
|
|
failed = !(total_error < max_quantization_error);
|
2023-04-22 11:10:39 +02:00
|
|
|
num_failed += failed;
|
|
|
|
if (failed || verbose) {
|
2023-04-25 23:40:51 +03:00
|
|
|
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
2023-04-22 11:10:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
|
|
|
|
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
|
|
|
num_failed += failed;
|
|
|
|
if (failed || verbose) {
|
|
|
|
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
|
|
|
|
}
|
|
|
|
|
|
|
|
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
2024-01-30 15:14:12 +02:00
|
|
|
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
2024-02-26 18:28:38 +02:00
|
|
|
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
|
|
|
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-05 21:48:47 -04:00
|
|
|
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
|
|
|
|
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
2024-02-24 16:23:52 +02:00
|
|
|
: MAX_DOT_PRODUCT_ERROR;
|
2024-01-30 15:14:12 +02:00
|
|
|
failed = !(vec_dot_error < max_allowed_error);
|
2023-04-22 11:10:39 +02:00
|
|
|
num_failed += failed;
|
|
|
|
if (failed || verbose) {
|
2023-04-25 23:40:51 +03:00
|
|
|
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
|
2023-04-22 11:10:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (num_failed || verbose) {
|
|
|
|
printf("%d tests failed\n", num_failed);
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_free(ctx);
|
|
|
|
|
|
|
|
return num_failed > 0;
|
|
|
|
}
|