2023-04-08 00:09:18 +02:00
|
|
|
#include "ggml.h"
|
2023-05-01 18:23:47 +02:00
|
|
|
#include "build-info.h"
|
2023-04-13 17:04:45 +02:00
|
|
|
|
|
|
|
#define LLAMA_API_INTERNAL
|
2023-04-08 00:09:18 +02:00
|
|
|
#include "llama.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cinttypes>
|
|
|
|
#include <cmath>
|
|
|
|
#include <cstdio>
|
|
|
|
#include <cstring>
|
|
|
|
#include <map>
|
|
|
|
#include <numeric>
|
|
|
|
#include <regex>
|
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <vector>
|
2023-04-20 19:42:27 +02:00
|
|
|
#include <thread>
|
|
|
|
#include <mutex>
|
2023-04-08 00:09:18 +02:00
|
|
|
|
2023-06-16 20:23:53 +02:00
|
|
|
#if defined(_MSC_VER)
|
|
|
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
|
|
#endif
|
|
|
|
|
2023-04-08 00:09:18 +02:00
|
|
|
struct quantize_stats_params {
|
|
|
|
std::string model = "models/7B/ggml-model-f16.bin";
|
|
|
|
bool verbose = false;
|
|
|
|
bool per_layer_stats = false;
|
|
|
|
bool print_histogram = false;
|
|
|
|
bool reference = false;
|
|
|
|
std::vector<std::string> include_layers;
|
|
|
|
std::vector<std::string> exclude_layers;
|
|
|
|
std::vector<enum ggml_type> include_types;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Number of equal-width buckets in the absolute-error histogram.
constexpr size_t HISTOGRAM_BUCKETS = 150;
// Absolute error spanned by the histogram buckets.
constexpr double HISTOGRAM_RANGE = 0.03;
|
|
|
|
|
|
|
|
struct error_stats {
|
|
|
|
size_t num_samples;
|
|
|
|
double total_error;
|
|
|
|
double max_error;
|
|
|
|
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Print the command-line help to stderr.
//
// argv[0] is used as the program name. A default-constructed
// quantize_stats_params supplies the default model path shown in the text.
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
    quantize_stats_params params;
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, " -h, --help show this help message and exit\n");
    fprintf(stderr, " -m FNAME, --model FNAME\n");
    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, " -r, --reference\n");
    fprintf(stderr, " use reference implementation (default: false)\n");
    fprintf(stderr, " -v, --verbose\n");
    fprintf(stderr, " verbose output (default: false)\n");
    fprintf(stderr, " -p, --per-layer-stats\n");
    fprintf(stderr, " print stats per layer (default: false)\n");
    fprintf(stderr, " --histogram\n");
    fprintf(stderr, " print error histogram (default: false)\n");
    fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
    fprintf(stderr, " only test layers matching pattern\n");
    fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
    fprintf(stderr, " exclude layers matching pattern\n");
    fprintf(stderr, " -t TYPE, --type TYPE\n");
    // main() accepts any name reported by ggml_type_name, so the list is only an example.
    fprintf(stderr, " only test given type (e.g. q4_0, q4_1)\n");
    // main() parses this option; values below 1 fall back to hardware concurrency.
    fprintf(stderr, " -n N, --num-threads N\n");
    fprintf(stderr, " max number of threads (default: 0 = hardware concurrency)\n");
    fprintf(stderr, "\n");
}
|
|
|
|
|
|
|
|
// Check if a layer is included/excluded by command line
|
|
|
|
bool layer_included(const quantize_stats_params params, const std::string & layer) {
|
|
|
|
for (const auto& excluded : params.exclude_layers) {
|
|
|
|
if (std::regex_search(layer, std::regex(excluded))) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (const auto& included : params.include_layers) {
|
|
|
|
if (std::regex_search(layer, std::regex(included))) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return params.include_layers.empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update error statistics given vectors with the before/after result of quantization
|
|
|
|
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
|
|
|
for (int64_t i = 0; i < nelements; i++) {
|
|
|
|
double diff = input[i] - output[i];
|
|
|
|
stats.total_error += diff * diff;
|
|
|
|
stats.max_error = fmax(fabs(diff), stats.max_error);
|
|
|
|
stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
|
|
|
|
}
|
|
|
|
stats.num_samples += nelements;
|
|
|
|
}
|
|
|
|
|
2023-04-20 19:42:27 +02:00
|
|
|
// Fold the statistics in `from` into `into`: counts, squared error and
// histogram buckets are summed, and the larger maximum error is kept.
void combine_error_stats(error_stats & into, const error_stats & from) {
    into.num_samples += from.num_samples;
    into.total_error += from.total_error;
    into.max_error    = std::max(into.max_error, from.max_error);

    for (size_t bucket = 0; bucket < HISTOGRAM_BUCKETS; ++bucket) {
        into.error_histogram[bucket] += from.error_histogram[bucket];
    }
}
|
|
|
|
|
2023-04-08 00:09:18 +02:00
|
|
|
double find_quantile(const error_stats & stats, double quantile) {
|
|
|
|
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
|
|
|
|
|
|
|
|
double accum = 0;
|
|
|
|
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
|
|
|
accum += stats.error_histogram[i];
|
|
|
|
if (accum >= sum*quantile) {
|
|
|
|
return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return INFINITY;
|
|
|
|
}
|
|
|
|
|
|
|
|
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
|
|
|
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
|
|
|
double median = find_quantile(stats, .5);
|
|
|
|
double pct95 = find_quantile(stats, .95);
|
|
|
|
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
|
|
|
|
if (print_histogram) {
|
|
|
|
printf("Error distribution:\n");
|
|
|
|
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
|
|
|
double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
|
|
|
double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
|
|
|
if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
|
|
|
|
printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// copied from ggml.h - verify that we can access this as a flat array
|
|
|
|
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
|
|
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
|
|
|
|
|
|
return
|
|
|
|
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
|
|
|
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
|
|
|
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
|
|
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
|
|
}
|
|
|
|
|
2023-04-20 19:42:27 +02:00
|
|
|
void test_roundtrip_on_chunk(
|
|
|
|
const ggml_tensor * layer,
|
|
|
|
int64_t offset,
|
|
|
|
int64_t chunk_size,
|
|
|
|
const quantize_fns_t & qfns,
|
|
|
|
bool use_reference,
|
|
|
|
float * input_scratch,
|
|
|
|
char * quantized_scratch,
|
|
|
|
float * output_scratch,
|
|
|
|
error_stats & stats) {
|
|
|
|
|
|
|
|
if (layer->type == GGML_TYPE_F16) {
|
|
|
|
for (int i = 0; i < chunk_size; i++) {
|
|
|
|
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
input_scratch = ggml_get_data_f32(layer) + offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (use_reference) {
|
|
|
|
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
|
|
|
|
} else {
|
|
|
|
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
|
|
|
|
}
|
|
|
|
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
|
|
|
|
|
|
|
|
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2023-04-08 00:09:18 +02:00
|
|
|
// Run quantization function for a single layer and update error stats
|
|
|
|
void test_roundtrip_on_layer(
|
|
|
|
std::string & name,
|
|
|
|
bool print_layer_stats,
|
|
|
|
const quantize_fns_t & qfns,
|
|
|
|
bool use_reference,
|
|
|
|
const ggml_tensor * layer,
|
2023-04-20 19:42:27 +02:00
|
|
|
std::vector<float> & input_scratch,
|
|
|
|
std::vector<char> & quantized_scratch,
|
|
|
|
std::vector<float> & output_scratch,
|
|
|
|
error_stats & total_error,
|
|
|
|
int max_thread = 0) {
|
2023-04-08 00:09:18 +02:00
|
|
|
|
|
|
|
assert(tensor_is_contiguous(layer));
|
|
|
|
error_stats layer_error {};
|
2023-04-20 19:42:27 +02:00
|
|
|
uint64_t nelements = ggml_nelements(layer);
|
2023-04-08 00:09:18 +02:00
|
|
|
|
2023-04-20 19:42:27 +02:00
|
|
|
float* input_scratch_ptr = nullptr;
|
|
|
|
if (layer->type == GGML_TYPE_F16) {
|
|
|
|
if (input_scratch.size() < nelements) input_scratch.resize(nelements);
|
|
|
|
input_scratch_ptr = input_scratch.data();
|
|
|
|
}
|
|
|
|
if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
|
|
|
|
if (output_scratch.size() < nelements) output_scratch.resize(nelements);
|
|
|
|
|
|
|
|
if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
|
|
|
|
int chunk_size = 32*512;
|
|
|
|
int num_chunks = (nelements + chunk_size - 1)/chunk_size;
|
|
|
|
|
|
|
|
if (num_chunks < 2 || max_thread < 2) {
|
|
|
|
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
|
|
|
|
output_scratch.data(), print_layer_stats ? layer_error : total_error);
|
|
|
|
} else {
|
|
|
|
auto & stats = print_layer_stats ? layer_error : total_error;
|
|
|
|
std::mutex mutex;
|
|
|
|
uint64_t counter = 0;
|
|
|
|
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
|
|
|
|
&quantized_scratch, &output_scratch, chunk_size] () {
|
|
|
|
error_stats local_stats {};
|
|
|
|
while (true) {
|
|
|
|
std::unique_lock<std::mutex> lock(mutex);
|
|
|
|
uint64_t offset = counter; counter += chunk_size;
|
|
|
|
if (offset >= nelements) {
|
|
|
|
combine_error_stats(stats, local_stats);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
lock.unlock();
|
|
|
|
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
|
|
|
|
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
|
|
|
|
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
2023-04-20 19:42:27 +02:00
|
|
|
};
|
|
|
|
int nthread = std::min(num_chunks, max_thread);
|
|
|
|
std::vector<std::thread> workers(nthread-1);
|
|
|
|
for (auto& w : workers) w = std::thread(compute);
|
|
|
|
compute();
|
|
|
|
for (auto& w : workers) w.join();
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
2023-04-20 19:42:27 +02:00
|
|
|
|
2023-04-08 00:09:18 +02:00
|
|
|
if (print_layer_stats) {
|
|
|
|
print_error_stats(name, layer_error, false);
|
2023-04-20 19:42:27 +02:00
|
|
|
combine_error_stats(total_error, layer_error);
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
|
ggml_time_init();
|
|
|
|
|
|
|
|
quantize_stats_params params;
|
|
|
|
|
|
|
|
// read command line
|
|
|
|
|
2023-04-20 19:42:27 +02:00
|
|
|
int max_thread = 0;
|
2023-04-08 00:09:18 +02:00
|
|
|
bool invalid_param = false;
|
|
|
|
std::string arg;
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
|
|
arg = argv[i];
|
|
|
|
|
|
|
|
if (arg == "-h" || arg == "--help") {
|
|
|
|
quantize_stats_print_usage(argc, argv);
|
|
|
|
exit(0);
|
|
|
|
} else if (arg == "-r" || arg == "--reference") {
|
|
|
|
params.reference = true;
|
|
|
|
} else if (arg == "-v") {
|
|
|
|
params.verbose = true;
|
|
|
|
} else if (arg == "-p" || arg == "--per-layer-stats") {
|
|
|
|
params.per_layer_stats = true;
|
|
|
|
} else if (arg == "--histogram") {
|
|
|
|
params.print_histogram = true;
|
|
|
|
} else if (arg == "-m" || arg == "--model") {
|
|
|
|
if (++i >= argc) {
|
|
|
|
invalid_param = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
params.model = argv[i];
|
|
|
|
} else if (arg == "-l" || arg == "--include-layer") {
|
|
|
|
if (++i >= argc) {
|
|
|
|
invalid_param = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
params.include_layers.push_back(argv[i]);
|
|
|
|
} else if (arg == "-L" || arg == "--exclude-layer") {
|
|
|
|
if (++i >= argc) {
|
|
|
|
invalid_param = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
params.exclude_layers.push_back(argv[i]);
|
|
|
|
} else if (arg == "-t" || arg == "--type") {
|
|
|
|
if (++i >= argc) {
|
|
|
|
invalid_param = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
int j;
|
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly daster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON A4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
|
|
|
for (j = 0; j < GGML_TYPE_COUNT; ++j) {
|
|
|
|
const auto * name = ggml_type_name((ggml_type) j);
|
|
|
|
if (name && strcmp(argv[i], name) == 0) break;
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
|
|
|
if (j < GGML_TYPE_COUNT) {
|
|
|
|
params.include_types.push_back((ggml_type) j);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
|
|
|
|
invalid_param = true;
|
|
|
|
}
|
2023-04-20 19:42:27 +02:00
|
|
|
} else if (arg == "-n" || arg == "--num-threads") {
|
|
|
|
if (++i >= argc) {
|
|
|
|
invalid_param = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
max_thread = atoi(argv[i]);
|
2023-04-08 00:09:18 +02:00
|
|
|
} else {
|
|
|
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
|
|
|
quantize_stats_print_usage(argc, argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (invalid_param) {
|
|
|
|
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
|
|
|
quantize_stats_print_usage(argc, argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2023-05-01 18:23:47 +02:00
|
|
|
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
|
|
|
|
2023-04-08 00:09:18 +02:00
|
|
|
// load the model
|
|
|
|
fprintf(stderr, "Loading model\n");
|
|
|
|
|
|
|
|
const int64_t t_main_start_us = ggml_time_us();
|
2023-06-24 10:47:58 +02:00
|
|
|
llama_model * model;
|
2023-04-08 00:09:18 +02:00
|
|
|
llama_context * ctx;
|
|
|
|
|
|
|
|
{
|
|
|
|
auto lparams = llama_context_default_params();
|
|
|
|
|
|
|
|
lparams.n_ctx = 256;
|
|
|
|
lparams.seed = 1;
|
|
|
|
lparams.f16_kv = false;
|
|
|
|
lparams.use_mlock = false;
|
|
|
|
|
2023-06-24 10:47:58 +02:00
|
|
|
model = llama_load_model_from_file(params.model.c_str(), lparams);
|
2023-04-08 00:09:18 +02:00
|
|
|
|
2023-06-24 10:47:58 +02:00
|
|
|
if (model == NULL) {
|
2023-04-08 00:09:18 +02:00
|
|
|
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
|
|
|
return 1;
|
|
|
|
}
|
2023-06-24 10:47:58 +02:00
|
|
|
|
|
|
|
ctx = llama_new_context_with_model(model, lparams);
|
|
|
|
|
|
|
|
if (ctx == NULL) {
|
|
|
|
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
|
|
|
llama_free_model(model);
|
|
|
|
return 1;
|
|
|
|
}
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
|
|
|
|
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
|
|
|
const auto &tensors = llama_internal_get_tensor_map(ctx);
|
2023-04-08 00:09:18 +02:00
|
|
|
|
|
|
|
// check layer tensors
|
|
|
|
int included_layers = 0;
|
|
|
|
int64_t max_nelements = 0;
|
|
|
|
bool is_f16 = false;
|
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
|
|
|
for (const auto& kv_tensor : tensors) {
|
2023-04-08 00:09:18 +02:00
|
|
|
if (!layer_included(params, kv_tensor.first)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (params.verbose) {
|
2023-04-14 20:05:37 +02:00
|
|
|
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
|
|
|
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
|
|
|
is_f16 = true;
|
|
|
|
} else if (kv_tensor.second->type != GGML_TYPE_F32) {
|
|
|
|
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
|
|
|
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
|
|
|
llama_free(ctx);
|
2023-06-24 10:47:58 +02:00
|
|
|
llama_free_model(model);
|
2023-04-08 00:09:18 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
included_layers++;
|
|
|
|
max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_f16) {
|
|
|
|
printf("note: source model is f16\n");
|
|
|
|
}
|
|
|
|
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
|
|
|
|
// allocate scratch space
|
2023-04-20 19:42:27 +02:00
|
|
|
std::vector<float> input_scratch;
|
|
|
|
std::vector<char> quantized_scratch;
|
|
|
|
std::vector<float> output_scratch;
|
2023-04-08 00:09:18 +02:00
|
|
|
|
|
|
|
// loop throught quantization types
|
|
|
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
2023-04-14 20:05:37 +02:00
|
|
|
const ggml_type type = (ggml_type) i;
|
2023-04-08 00:09:18 +02:00
|
|
|
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
|
|
|
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
|
|
|
if (params.verbose) {
|
2023-04-14 20:05:37 +02:00
|
|
|
printf("testing %s ...\n", ggml_type_name(type));
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
error_stats global_stats {};
|
|
|
|
|
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
|
|
|
for (const auto& kv_tensor : tensors) {
|
2023-04-08 00:09:18 +02:00
|
|
|
if (!layer_included(params, kv_tensor.first)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (params.verbose) {
|
|
|
|
printf(" %s ...\n", kv_tensor.first.c_str());
|
|
|
|
}
|
2023-04-14 20:05:37 +02:00
|
|
|
std::string layer_name { ggml_type_name(type) };
|
2023-04-08 00:09:18 +02:00
|
|
|
layer_name += "::" + kv_tensor.first;
|
|
|
|
test_roundtrip_on_layer(
|
|
|
|
layer_name,
|
|
|
|
params.per_layer_stats,
|
|
|
|
qfns,
|
|
|
|
params.reference,
|
|
|
|
kv_tensor.second,
|
2023-04-20 19:42:27 +02:00
|
|
|
input_scratch,
|
|
|
|
quantized_scratch,
|
|
|
|
output_scratch,
|
|
|
|
global_stats,
|
|
|
|
max_thread
|
2023-04-08 00:09:18 +02:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2023-04-14 20:05:37 +02:00
|
|
|
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
|
2023-04-08 00:09:18 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
llama_free(ctx);
|
2023-06-24 10:47:58 +02:00
|
|
|
llama_free_model(model);
|
2023-04-08 00:09:18 +02:00
|
|
|
// report timing
|
|
|
|
{
|
|
|
|
const int64_t t_main_end_us = ggml_time_us();
|
|
|
|
|
|
|
|
printf("\n");
|
|
|
|
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|