llama.cpp/quantize.cpp

#include "ggml.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <string>

// Forwarded unchanged as the final argument of llama_model_quantize() in main().
// NOTE(review): presumably the number of values per quantization block — confirm against llama.h.
constexpr int QK = 32;

// usage:
//  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
// Prints the command-line usage text to stderr.
static void quantize_print_usage(const char * prog) {
    fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", prog);
    fprintf(stderr, "  type = 2 - q4_0\n");
    fprintf(stderr, "  type = 3 - q4_1\n");
}

// Parses `arg` as a base-10 integer into `out`.
// Returns false (leaving `out` untouched) when `arg` is empty, contains
// anything other than a complete integer, or does not fit in an int.
static bool quantize_parse_type(const char * arg, int & out) {
    char * end = NULL;
    const long value = std::strtol(arg, &end, 10);

    // no digits consumed, or trailing characters after the number
    if (end == arg || *end != '\0') {
        return false;
    }

    // reject values that do not survive the narrowing to int
    // (also covers the LONG_MIN/LONG_MAX results strtol returns on overflow
    // on platforms where long is wider than int)
    const int narrowed = static_cast<int>(value);
    if (static_cast<long>(narrowed) != value) {
        return false;
    }

    out = narrowed;
    return true;
}

// Entry point of the quantize tool.
//
// Expects exactly three arguments: input model path, output model path and
// the numeric quantization type. Calls llama_model_quantize() and prints how
// long the quantization and the whole run took.
//
// Returns 0 on success and 1 on a usage error, an unparseable type argument,
// or when llama_model_quantize() reports failure (non-zero return).
int main(int argc, char ** argv) {
    ggml_time_init();

    if (argc != 4) {
        quantize_print_usage(argv[0]);
        return 1;
    }

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

    // Parse the type strictly: atoi() would silently map non-numeric input
    // such as "q4_0" to 0 and accept trailing garbage such as "2abc".
    // Whether a well-formed number is a supported type is still decided by
    // llama_model_quantize().
    int itype = 0;
    if (!quantize_parse_type(argv[3], itype)) {
        fprintf(stderr, "%s: invalid quantization type '%s'\n", __func__, argv[3]);
        quantize_print_usage(argv[0]);
        return 1;
    }

    const int64_t t_main_start_us = ggml_time_us();

    int64_t t_quantize_us = 0;

    // quantize the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

        t_quantize_us = ggml_time_us() - t_start_us;
    }

    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        // divide in double: converting the int64 microsecond counts to float
        // would drop low-order digits on long runs
        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    return 0;
}
Initial release 2023-03-10 19:40:58 +01:00			`#include "ggml.h"`
Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`#include "llama.h"`
Initial release 2023-03-10 19:40:58 +01:00
			`#include <cstdio>`
			`#include <string>`
We could use std::unordered_map over std::map (#305) * Improve performance by changing std::map to std::unordered_map and std::map<id, token> id_to_token; to std::vector<token> id_to_token; * fix last commit on gpt_vocab_init add vocab.id_to_token.resize(vocab.token_to_id.size()); * Removed include <map> * Nest struct token score inside gpt_vocab * renamed token to tok 2023-03-21 18:21:50 +01:00
Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`const int QK = 32;`
Initial release 2023-03-10 19:40:58 +01:00
			`// usage:`
			`// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type`
			`//`
			`int main(int argc, char ** argv) {`
Windows fixes (#31) * Apply fixes suggested to build on windows Issue: https://github.com/ggerganov/llama.cpp/issues/22 * Remove unsupported VLAs * MSVC: Remove features that are only available on MSVC C++20. * Fix zero initialization of the other fields. * Change the use of vector for stack allocations. 2023-03-12 21:15:00 +01:00			`ggml_time_init();`
Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00
Initial release 2023-03-10 19:40:58 +01:00			`if (argc != 4) {`
			`fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);`
			`fprintf(stderr, " type = 2 - q4_0\n");`
			`fprintf(stderr, " type = 3 - q4_1\n");`
			`return 1;`
			`}`

Fix un-initialized FP16 tables on x86 (#15, #2) 2023-03-11 16:40:14 +01:00			`// needed to initialize f16 tables`
			`{`
			`struct ggml_init_params params = { 0, NULL };`
			`struct ggml_context * ctx = ggml_init(params);`
			`ggml_free(ctx);`
			`}`

Initial release 2023-03-10 19:40:58 +01:00			`const std::string fname_inp = argv[1];`
			`const std::string fname_out = argv[2];`

			`const int itype = atoi(argv[3]);`

			`const int64_t t_main_start_us = ggml_time_us();`

			`int64_t t_quantize_us = 0;`

			`// load the model`
			`{`
			`const int64_t t_start_us = ggml_time_us();`

Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 2023-03-22 06:32:36 +01:00			`if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {`
Initial release 2023-03-10 19:40:58 +01:00			`fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());`
			`return 1;`
			`}`

			`t_quantize_us = ggml_time_us() - t_start_us;`
			`}`

			`// report timing`
			`{`
			`const int64_t t_main_end_us = ggml_time_us();`

			`printf("\n");`
			`printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);`
			`printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);`
			`}`

			`return 0;`
			`}`